From 55aebeb926b6f93a540328e7ac770ef536b09b77 Mon Sep 17 00:00:00 2001
From: Daniel Baluta <daniel.baluta@intel.com>
Date: Mon, 10 Nov 2014 14:45:30 +0200
Subject: iio: core: Introduce IIO_ACTIVITY channel

This channel will be used for exposing information about
activity composite sensors. Activities supported so far:
	* running
	* jogging
	* walking
	* still

THRESHOLD event is used to signal a change in the activity
state.

We associate a confidence interval for each activity expressed
as a percentage from 0 to 100.
  * 0, means the sensor IS NOT reporting that activity.
  * 100, means the sensor IS reporting that activity.

Users of this interface have two possible means to gather
information about the ongoing activities.

1. Event based, via event file descriptor
  * sensor may report an event when ENTERING an activity or LEAVING
    an activity based on a threshold value.
  * drivers will wake up applications waiting data on the event fd

2. Polling, by reading the sysfs associated attribute files:
  * /sys/bus/iio/devices/iio:device0/in_activity_running_input
expressed as percentage confidence value from 0 to 100.

This will offer an interface for Android significant motion
composite sensor defined here:
http://source.android.com/devices/sensors/composite_sensors.html

Activities listed above are supported by Freescale's MMA9553 sensor:
http://freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Daniel Baluta <daniel.baluta@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/types.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 4a2af8adf874..b3a241d53b54 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -30,6 +30,7 @@ enum iio_chan_type {
 	IIO_CCT,
 	IIO_PRESSURE,
 	IIO_HUMIDITYRELATIVE,
+	IIO_ACTIVITY,
 };
 
 enum iio_modifier {
@@ -59,7 +60,11 @@ enum iio_modifier {
 	IIO_MOD_NORTH_MAGN,
 	IIO_MOD_NORTH_TRUE,
 	IIO_MOD_NORTH_MAGN_TILT_COMP,
-	IIO_MOD_NORTH_TRUE_TILT_COMP
+	IIO_MOD_NORTH_TRUE_TILT_COMP,
+	IIO_MOD_RUNNING,
+	IIO_MOD_JOGGING,
+	IIO_MOD_WALKING,
+	IIO_MOD_STILL,
 };
 
 enum iio_event_type {
-- 
cgit v1.2.3


From 1843c2f3def16740eb6d129a9790c32dd21aa5ea Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Mon, 10 Nov 2014 14:45:31 +0200
Subject: iio: core: Introduce IIO_EV_DIR_NONE

For some events (e.g.: step detector) a direction does not make sense.

Add IIO_EV_DIR_NONE to be used with such events and generate sysfs event
attributes that do not contain direction.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Daniel Baluta <daniel.baluta@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-event.c | 12 +++++++++---
 include/linux/iio/types.h        |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 0c1e37e3120a..1290290adcb5 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -327,9 +327,15 @@ static int iio_device_add_event(struct iio_dev *indio_dev,
 	for_each_set_bit(i, mask, sizeof(*mask)*8) {
 		if (i >= ARRAY_SIZE(iio_ev_info_text))
 			return -EINVAL;
-		postfix = kasprintf(GFP_KERNEL, "%s_%s_%s",
-				iio_ev_type_text[type], iio_ev_dir_text[dir],
-				iio_ev_info_text[i]);
+		if (dir != IIO_EV_DIR_NONE)
+			postfix = kasprintf(GFP_KERNEL, "%s_%s_%s",
+					iio_ev_type_text[type],
+					iio_ev_dir_text[dir],
+					iio_ev_info_text[i]);
+		else
+			postfix = kasprintf(GFP_KERNEL, "%s_%s",
+					iio_ev_type_text[type],
+					iio_ev_info_text[i]);
 		if (postfix == NULL)
 			return -ENOMEM;
 
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index b3a241d53b54..52cb5329407b 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -86,6 +86,7 @@ enum iio_event_direction {
 	IIO_EV_DIR_EITHER,
 	IIO_EV_DIR_RISING,
 	IIO_EV_DIR_FALLING,
+	IIO_EV_DIR_NONE,
 };
 
 #define IIO_VAL_INT 1
-- 
cgit v1.2.3


From a88bfe78583026eb9f21d4014ba481b22b66cee3 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Mon, 10 Nov 2014 14:45:32 +0200
Subject: iio: core: Introduce STEPS channel, ENABLE mask and INSTANCE event

These changes are needed to support the functionality of a pedometer.
A pedometer has two basic functionalities: step counter and step detector.

The step counter needs to be enabled and then it will count the steps
in its hardware register. Whenever the application needs to check
the step count, it will read the step counter register. To support the
step counter a new channel type STEPS is added. Since the pedometer needs
to be enabled first so that the hardware can count and store the steps,
we need a specific ENABLE channel info mask.

The step detector will generate an interrupt each time a step is detected.
To support this functionality we add a new event type INSTANCE.

For more information on the Android requirements for step counter and step
detector see:
http://source.android.com/devices/sensors/composite_sensors.html#counter
and http://source.android.com/devices/sensors/composite_sensors.html#detector.

A device that has the pedometer functionality this interface needs to
support is Freescale's MMA9553L:
http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Daniel Baluta <daniel.baluta@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 22 ++++++++++++++++++++++
 drivers/iio/industrialio-core.c         |  2 ++
 drivers/iio/industrialio-event.c        |  1 +
 include/linux/iio/iio.h                 |  1 +
 include/linux/iio/types.h               |  2 ++
 5 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 7bf49ad8fd82..c60b0a1af839 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -856,6 +856,13 @@ Description:
 		number or direction is not specified, applies to all channels of
 		this type.
 
+What:		/sys/.../events/in_steps_instance_en
+KernelVersion:	3.19
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Enables or disables step detection. Each time the user takes a step an
+		event of this type will be generated.
+
 What:		/sys/bus/iio/devices/iio:deviceX/trigger/current_trigger
 KernelVersion:	2.6.35
 Contact:	linux-iio@vger.kernel.org
@@ -1095,3 +1102,18 @@ Description:
 		after application of scale and offset. If no offset or scale is
 		present, output should be considered as processed with the
 		unit in milliamps.
+
+What:		/sys/.../iio:deviceX/in_steps_en
+KernelVersion:	3.19
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Activates the step counter. After activation, the number of steps
+		taken by the user will be counted in hardware and exported through
+		in_steps_input.
+
+What:		/sys/.../iio:deviceX/in_steps_input
+KernelVersion:	3.19
+Contact:	linux-iio@vger.kernel.org
+Description:
+		This attribute is used to read the number of steps taken by the user
+		since the last reboot while activated.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index e453ef9e0c36..1e060f390b08 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -71,6 +71,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_PRESSURE] = "pressure",
 	[IIO_HUMIDITYRELATIVE] = "humidityrelative",
 	[IIO_ACTIVITY] = "activity",
+	[IIO_STEPS] = "steps",
 };
 
 static const char * const iio_modifier_names[] = {
@@ -118,6 +119,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_HARDWAREGAIN] = "hardwaregain",
 	[IIO_CHAN_INFO_HYSTERESIS] = "hysteresis",
 	[IIO_CHAN_INFO_INT_TIME] = "integration_time",
+	[IIO_CHAN_INFO_ENABLE] = "en",
 };
 
 /**
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 1290290adcb5..3f5cee0295c5 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -197,6 +197,7 @@ static const char * const iio_ev_type_text[] = {
 	[IIO_EV_TYPE_ROC] = "roc",
 	[IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive",
 	[IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive",
+	[IIO_EV_TYPE_INSTANCE] = "instance",
 };
 
 static const char * const iio_ev_dir_text[] = {
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 3642ce7ef512..f45a400a5e3e 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -38,6 +38,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_HARDWAREGAIN,
 	IIO_CHAN_INFO_HYSTERESIS,
 	IIO_CHAN_INFO_INT_TIME,
+	IIO_CHAN_INFO_ENABLE,
 };
 
 enum iio_shared_by {
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 52cb5329407b..904dcbbf0e6f 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -31,6 +31,7 @@ enum iio_chan_type {
 	IIO_PRESSURE,
 	IIO_HUMIDITYRELATIVE,
 	IIO_ACTIVITY,
+	IIO_STEPS,
 };
 
 enum iio_modifier {
@@ -73,6 +74,7 @@ enum iio_event_type {
 	IIO_EV_TYPE_ROC,
 	IIO_EV_TYPE_THRESH_ADAPTIVE,
 	IIO_EV_TYPE_MAG_ADAPTIVE,
+	IIO_EV_TYPE_INSTANCE,
 };
 
 enum iio_event_info {
-- 
cgit v1.2.3


From bcdf28fb1b8badf3cdba18d349f6251057e36a45 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Mon, 10 Nov 2014 14:45:33 +0200
Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBHEIGHT

Some devices need the height of the user to compute various
parameters. One of this devices is Freescale's MMA9553L
(http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf)
that needs the height of the user to compute the stride length which
is used further to determine distance, speed and activity type.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Daniel Baluta <daniel.baluta@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 8 ++++++++
 drivers/iio/industrialio-core.c         | 1 +
 include/linux/iio/iio.h                 | 1 +
 3 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index c60b0a1af839..4a9e29a3c2dc 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -323,6 +323,14 @@ Description:
 		production inaccuracies).  If shared across all channels,
 		<type>_calibscale is used.
 
+What:		/sys/bus/iio/devices/iio:deviceX/in_steps_calibheight
+KernelVersion:	3.19
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Height of the user (in centimeters) used by some pedometers
+		to compute the stride length, distance, speed and activity
+		type.
+
 What:		/sys/bus/iio/devices/iio:deviceX/in_accel_scale_available
 What:		/sys/.../iio:deviceX/in_voltageX_scale_available
 What:		/sys/.../iio:deviceX/in_voltage-voltage_scale_available
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 1e060f390b08..45bb3a43afac 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -120,6 +120,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_HYSTERESIS] = "hysteresis",
 	[IIO_CHAN_INFO_INT_TIME] = "integration_time",
 	[IIO_CHAN_INFO_ENABLE] = "en",
+	[IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight",
 };
 
 /**
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index f45a400a5e3e..878d861b0610 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -39,6 +39,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_HYSTERESIS,
 	IIO_CHAN_INFO_INT_TIME,
 	IIO_CHAN_INFO_ENABLE,
+	IIO_CHAN_INFO_CALIBHEIGHT,
 };
 
 enum iio_shared_by {
-- 
cgit v1.2.3


From f9380e7123863a4cb0627d940533be954a0a15df Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Thu, 27 Nov 2014 01:42:45 +0300
Subject: iio: inkern: add iio_write_channel_raw

Introduce API for easy in-kernel setting of DAC values.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/inkern.c         | 25 +++++++++++++++++++++++++
 include/linux/iio/consumer.h | 10 ++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 866fe904cba2..21655fd1465c 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -631,3 +631,28 @@ err_unlock:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iio_get_channel_type);
+
+static int iio_channel_write(struct iio_channel *chan, int val, int val2,
+			     enum iio_chan_info_enum info)
+{
+	return chan->indio_dev->info->write_raw(chan->indio_dev,
+						chan->channel, val, val2, info);
+}
+
+int iio_write_channel_raw(struct iio_channel *chan, int val)
+{
+	int ret;
+
+	mutex_lock(&chan->indio_dev->info_exist_lock);
+	if (chan->indio_dev->info == NULL) {
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	ret = iio_channel_write(chan, val, 0, IIO_CHAN_INFO_RAW);
+err_unlock:
+	mutex_unlock(&chan->indio_dev->info_exist_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iio_write_channel_raw);
diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index 651f9a0e2765..6f64624f329b 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -150,6 +150,16 @@ int iio_read_channel_average_raw(struct iio_channel *chan, int *val);
  */
 int iio_read_channel_processed(struct iio_channel *chan, int *val);
 
+/**
+ * iio_write_channel_raw() - write to a given channel
+ * @chan:		The channel being queried.
+ * @val:		Value being written.
+ *
+ * Note raw writes to iio channels are in dac counts and hence
+ * scale will need to be applied if standard units required.
+ */
+int iio_write_channel_raw(struct iio_channel *chan, int val);
+
 /**
  * iio_get_channel_type() - get the type of a channel
  * @channel:		The channel being queried.
-- 
cgit v1.2.3


From 217a5cf0a1100264ced523e437e2e22c442dca7c Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 26 Nov 2014 18:55:09 +0100
Subject: iio: Unexport iio_scan_mask_set()

Individual drivers should not be messing with the scan mask that contains
the list of enabled channels. This is something that is supposed to be
managed by the core.

Now that the last few drivers that used it to configure a default scan mask
have been updated to not do this anymore we can unexport the function.

Note, this patch also requires moving a few functions around so they are all
declared before the first internal user.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Reviewed-by: Daniel Baluta <daniel.baluta@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-buffer.c | 149 +++++++++++++++++++-------------------
 include/linux/iio/buffer.h        |   9 ---
 2 files changed, 74 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index f971f79103ec..f667e4e7ea6d 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -178,6 +178,80 @@ static ssize_t iio_scan_el_show(struct device *dev,
 	return sprintf(buf, "%d\n", ret);
 }
 
+/* Note NULL used as error indicator as it doesn't make sense. */
+static const unsigned long *iio_scan_mask_match(const unsigned long *av_masks,
+					  unsigned int masklength,
+					  const unsigned long *mask)
+{
+	if (bitmap_empty(mask, masklength))
+		return NULL;
+	while (*av_masks) {
+		if (bitmap_subset(mask, av_masks, masklength))
+			return av_masks;
+		av_masks += BITS_TO_LONGS(masklength);
+	}
+	return NULL;
+}
+
+static bool iio_validate_scan_mask(struct iio_dev *indio_dev,
+	const unsigned long *mask)
+{
+	if (!indio_dev->setup_ops->validate_scan_mask)
+		return true;
+
+	return indio_dev->setup_ops->validate_scan_mask(indio_dev, mask);
+}
+
+/**
+ * iio_scan_mask_set() - set particular bit in the scan mask
+ * @indio_dev: the iio device
+ * @buffer: the buffer whose scan mask we are interested in
+ * @bit: the bit to be set.
+ *
+ * Note that at this point we have no way of knowing what other
+ * buffers might request, hence this code only verifies that the
+ * individual buffers request is plausible.
+ */
+static int iio_scan_mask_set(struct iio_dev *indio_dev,
+		      struct iio_buffer *buffer, int bit)
+{
+	const unsigned long *mask;
+	unsigned long *trialmask;
+
+	trialmask = kmalloc(sizeof(*trialmask)*
+			    BITS_TO_LONGS(indio_dev->masklength),
+			    GFP_KERNEL);
+
+	if (trialmask == NULL)
+		return -ENOMEM;
+	if (!indio_dev->masklength) {
+		WARN_ON("Trying to set scanmask prior to registering buffer\n");
+		goto err_invalid_mask;
+	}
+	bitmap_copy(trialmask, buffer->scan_mask, indio_dev->masklength);
+	set_bit(bit, trialmask);
+
+	if (!iio_validate_scan_mask(indio_dev, trialmask))
+		goto err_invalid_mask;
+
+	if (indio_dev->available_scan_masks) {
+		mask = iio_scan_mask_match(indio_dev->available_scan_masks,
+					   indio_dev->masklength,
+					   trialmask);
+		if (!mask)
+			goto err_invalid_mask;
+	}
+	bitmap_copy(buffer->scan_mask, trialmask, indio_dev->masklength);
+
+	kfree(trialmask);
+
+	return 0;
+
+err_invalid_mask:
+	kfree(trialmask);
+	return -EINVAL;
+}
+
 static int iio_scan_mask_clear(struct iio_buffer *buffer, int bit)
 {
 	clear_bit(bit, buffer->scan_mask);
@@ -455,21 +529,6 @@ ssize_t iio_buffer_show_enable(struct device *dev,
 }
 EXPORT_SYMBOL(iio_buffer_show_enable);
 
-/* Note NULL used as error indicator as it doesn't make sense. */
-static const unsigned long *iio_scan_mask_match(const unsigned long *av_masks,
-					  unsigned int masklength,
-					  const unsigned long *mask)
-{
-	if (bitmap_empty(mask, masklength))
-		return NULL;
-	while (*av_masks) {
-		if (bitmap_subset(mask, av_masks, masklength))
-			return av_masks;
-		av_masks += BITS_TO_LONGS(masklength);
-	}
-	return NULL;
-}
-
 static int iio_compute_scan_bytes(struct iio_dev *indio_dev,
 				const unsigned long *mask, bool timestamp)
 {
@@ -808,66 +867,6 @@ bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 }
 EXPORT_SYMBOL_GPL(iio_validate_scan_mask_onehot);
 
-static bool iio_validate_scan_mask(struct iio_dev *indio_dev,
-	const unsigned long *mask)
-{
-	if (!indio_dev->setup_ops->validate_scan_mask)
-		return true;
-
-	return indio_dev->setup_ops->validate_scan_mask(indio_dev, mask);
-}
-
-/**
- * iio_scan_mask_set() - set particular bit in the scan mask
- * @indio_dev: the iio device
- * @buffer: the buffer whose scan mask we are interested in
- * @bit: the bit to be set.
- *
- * Note that at this point we have no way of knowing what other
- * buffers might request, hence this code only verifies that the
- * individual buffers request is plausible.
- */
-int iio_scan_mask_set(struct iio_dev *indio_dev,
-		      struct iio_buffer *buffer, int bit)
-{
-	const unsigned long *mask;
-	unsigned long *trialmask;
-
-	trialmask = kmalloc(sizeof(*trialmask)*
-			    BITS_TO_LONGS(indio_dev->masklength),
-			    GFP_KERNEL);
-
-	if (trialmask == NULL)
-		return -ENOMEM;
-	if (!indio_dev->masklength) {
-		WARN_ON("Trying to set scanmask prior to registering buffer\n");
-		goto err_invalid_mask;
-	}
-	bitmap_copy(trialmask, buffer->scan_mask, indio_dev->masklength);
-	set_bit(bit, trialmask);
-
-	if (!iio_validate_scan_mask(indio_dev, trialmask))
-		goto err_invalid_mask;
-
-	if (indio_dev->available_scan_masks) {
-		mask = iio_scan_mask_match(indio_dev->available_scan_masks,
-					   indio_dev->masklength,
-					   trialmask);
-		if (!mask)
-			goto err_invalid_mask;
-	}
-	bitmap_copy(buffer->scan_mask, trialmask, indio_dev->masklength);
-
-	kfree(trialmask);
-
-	return 0;
-
-err_invalid_mask:
-	kfree(trialmask);
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(iio_scan_mask_set);
-
 int iio_scan_mask_query(struct iio_dev *indio_dev,
 			struct iio_buffer *buffer, int bit)
 {
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 519392763393..8c8ce611949c 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -116,15 +116,6 @@ void iio_buffer_init(struct iio_buffer *buffer);
 int iio_scan_mask_query(struct iio_dev *indio_dev,
 			struct iio_buffer *buffer, int bit);
 
-/**
- * iio_scan_mask_set() - set particular bit in the scan mask
- * @indio_dev		IIO device structure
- * @buffer:		the buffer whose scan mask we are interested in
- * @bit:		the bit to be set.
- **/
-int iio_scan_mask_set(struct iio_dev *indio_dev,
-		      struct iio_buffer *buffer, int bit);
-
 /**
  * iio_push_to_buffers() - push to a registered buffer.
  * @indio_dev:		iio_dev structure for device.
-- 
cgit v1.2.3


From 3e1b6c95b990c93f4aa3b17e9f66221e2fa44bee Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 26 Nov 2014 18:55:12 +0100
Subject: iio: Move buffer registration to the core

Originally device and buffer registration were kept as separate operations
in IIO to allow to register two distinct sets of channels for buffered and
non-buffered operations. This has since already been further restricted and
the channel set registered for the buffer needs to be a subset of the
channel set registered for the device. Additionally the possibility to not
have a raw (or processed) attribute for a channel which was registered for
the device was added a while ago. This means it is possible to not register
any device level attributes for a channel even if it is registered for the
device. Also if a channel's scan_index is set to -1 and the channel is
registered for the buffer it is ignored.

So in summary it means it is possible to register the same channel array for
both the device and the buffer yet still end up with distinctive sets of
channels for both of them. This makes the argument for having to have to
manually register the channels for both the device and the buffer invalid.
Considering that the vast majority of all drivers want to register the same
set of channels for both the buffer and the device it makes sense to move
the buffer registration into the core to avoid some boiler-plate code in the
device driver setup path.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/adc/ti_am335x_adc.c                 |  9 ---------
 drivers/iio/iio_core.h                          |  9 +++++++++
 drivers/iio/industrialio-buffer.c               | 18 ++++++++++-------
 drivers/iio/industrialio-core.c                 | 14 ++++++++++++-
 drivers/iio/industrialio-triggered-buffer.c     | 11 +----------
 drivers/staging/iio/accel/lis3l02dq_core.c      | 13 +------------
 drivers/staging/iio/accel/sca3000_core.c        | 10 +---------
 drivers/staging/iio/iio_simple_dummy_buffer.c   |  8 --------
 drivers/staging/iio/impedance-analyzer/ad5933.c | 12 ++----------
 drivers/staging/iio/meter/ade7758.h             |  1 -
 drivers/staging/iio/meter/ade7758_core.c        | 15 ++------------
 drivers/staging/iio/meter/ade7758_ring.c        |  5 -----
 include/linux/iio/buffer.h                      | 26 -------------------------
 13 files changed, 40 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c
index b730864731e8..d550ac7d2365 100644
--- a/drivers/iio/adc/ti_am335x_adc.c
+++ b/drivers/iio/adc/ti_am335x_adc.c
@@ -264,16 +264,8 @@ static int tiadc_iio_buffered_hardware_setup(struct iio_dev *indio_dev,
 	indio_dev->setup_ops = setup_ops;
 	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
 
-	ret = iio_buffer_register(indio_dev,
-				  indio_dev->channels,
-				  indio_dev->num_channels);
-	if (ret)
-		goto error_free_irq;
-
 	return 0;
 
-error_free_irq:
-	free_irq(irq, indio_dev);
 error_kfifo_free:
 	iio_kfifo_free(indio_dev->buffer);
 	return ret;
@@ -285,7 +277,6 @@ static void tiadc_iio_buffered_hardware_remove(struct iio_dev *indio_dev)
 
 	free_irq(adc_dev->mfd_tscadc->irq, indio_dev);
 	iio_kfifo_free(indio_dev->buffer);
-	iio_buffer_unregister(indio_dev);
 }
 
 
diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index 5f0ea77fe717..359883525ab7 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -48,6 +48,8 @@ unsigned int iio_buffer_poll(struct file *filp,
 ssize_t iio_buffer_read_first_n_outer(struct file *filp, char __user *buf,
 				      size_t n, loff_t *f_ps);
 
+int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev);
+void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev);
 
 #define iio_buffer_poll_addr (&iio_buffer_poll)
 #define iio_buffer_read_first_n_outer_addr (&iio_buffer_read_first_n_outer)
@@ -60,6 +62,13 @@ void iio_buffer_wakeup_poll(struct iio_dev *indio_dev);
 #define iio_buffer_poll_addr NULL
 #define iio_buffer_read_first_n_outer_addr NULL
 
+static inline int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
+{
+	return 0;
+}
+
+static inline void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev) {}
+
 static inline void iio_disable_all_buffers(struct iio_dev *indio_dev) {}
 static inline void iio_buffer_wakeup_poll(struct iio_dev *indio_dev) {}
 
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index f667e4e7ea6d..8bb3e64eaf2c 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -385,14 +385,16 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 
 static const char * const iio_scan_elements_group_name = "scan_elements";
 
-int iio_buffer_register(struct iio_dev *indio_dev,
-			const struct iio_chan_spec *channels,
-			int num_channels)
+int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 {
 	struct iio_dev_attr *p;
 	struct attribute **attr;
 	struct iio_buffer *buffer = indio_dev->buffer;
 	int ret, i, attrn, attrcount, attrcount_orig = 0;
+	const struct iio_chan_spec *channels;
+
+	if (!buffer)
+		return 0;
 
 	if (buffer->attrs)
 		indio_dev->groups[indio_dev->groupcounter++] = buffer->attrs;
@@ -404,9 +406,10 @@ int iio_buffer_register(struct iio_dev *indio_dev,
 	}
 	attrcount = attrcount_orig;
 	INIT_LIST_HEAD(&buffer->scan_el_dev_attr_list);
+	channels = indio_dev->channels;
 	if (channels) {
 		/* new magic */
-		for (i = 0; i < num_channels; i++) {
+		for (i = 0; i < indio_dev->num_channels; i++) {
 			if (channels[i].scan_index < 0)
 				continue;
 
@@ -463,15 +466,16 @@ error_cleanup_dynamic:
 
 	return ret;
 }
-EXPORT_SYMBOL(iio_buffer_register);
 
-void iio_buffer_unregister(struct iio_dev *indio_dev)
+void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev)
 {
+	if (!indio_dev->buffer)
+		return;
+
 	kfree(indio_dev->buffer->scan_mask);
 	kfree(indio_dev->buffer->scan_el_group.attrs);
 	iio_free_chan_devattr_list(&indio_dev->buffer->scan_el_dev_attr_list);
 }
-EXPORT_SYMBOL(iio_buffer_unregister);
 
 ssize_t iio_buffer_read_length(struct device *dev,
 			       struct device_attribute *attr,
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 45bb3a43afac..ee442ee482ab 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1158,11 +1158,19 @@ int iio_device_register(struct iio_dev *indio_dev)
 			"Failed to register debugfs interfaces\n");
 		return ret;
 	}
+
+	ret = iio_buffer_alloc_sysfs_and_mask(indio_dev);
+	if (ret) {
+		dev_err(indio_dev->dev.parent,
+			"Failed to create buffer sysfs interfaces\n");
+		goto error_unreg_debugfs;
+	}
+
 	ret = iio_device_register_sysfs(indio_dev);
 	if (ret) {
 		dev_err(indio_dev->dev.parent,
 			"Failed to register sysfs interfaces\n");
-		goto error_unreg_debugfs;
+		goto error_buffer_free_sysfs;
 	}
 	ret = iio_device_register_eventset(indio_dev);
 	if (ret) {
@@ -1195,6 +1203,8 @@ error_unreg_eventset:
 	iio_device_unregister_eventset(indio_dev);
 error_free_sysfs:
 	iio_device_unregister_sysfs(indio_dev);
+error_buffer_free_sysfs:
+	iio_buffer_free_sysfs_and_mask(indio_dev);
 error_unreg_debugfs:
 	iio_device_unregister_debugfs(indio_dev);
 	return ret;
@@ -1223,6 +1233,8 @@ void iio_device_unregister(struct iio_dev *indio_dev)
 	iio_buffer_wakeup_poll(indio_dev);
 
 	mutex_unlock(&indio_dev->info_exist_lock);
+
+	iio_buffer_free_sysfs_and_mask(indio_dev);
 }
 EXPORT_SYMBOL(iio_device_unregister);
 
diff --git a/drivers/iio/industrialio-triggered-buffer.c b/drivers/iio/industrialio-triggered-buffer.c
index d6f54930b34a..61a5d0404edf 100644
--- a/drivers/iio/industrialio-triggered-buffer.c
+++ b/drivers/iio/industrialio-triggered-buffer.c
@@ -32,7 +32,7 @@ static const struct iio_buffer_setup_ops iio_triggered_buffer_setup_ops = {
  *
  * This function combines some common tasks which will normally be performed
  * when setting up a triggered buffer. It will allocate the buffer and the
- * pollfunc, as well as register the buffer with the IIO core.
+ * pollfunc.
  *
  * Before calling this function the indio_dev structure should already be
  * completely initialized, but not yet registered. In practice this means that
@@ -78,16 +78,8 @@ int iio_triggered_buffer_setup(struct iio_dev *indio_dev,
 	/* Flag that polled ring buffering is possible */
 	indio_dev->modes |= INDIO_BUFFER_TRIGGERED;
 
-	ret = iio_buffer_register(indio_dev,
-				  indio_dev->channels,
-				  indio_dev->num_channels);
-	if (ret)
-		goto error_dealloc_pollfunc;
-
 	return 0;
 
-error_dealloc_pollfunc:
-	iio_dealloc_pollfunc(indio_dev->pollfunc);
 error_kfifo_free:
 	iio_kfifo_free(indio_dev->buffer);
 error_ret:
@@ -101,7 +93,6 @@ EXPORT_SYMBOL(iio_triggered_buffer_setup);
  */
 void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev)
 {
-	iio_buffer_unregister(indio_dev);
 	iio_dealloc_pollfunc(indio_dev->pollfunc);
 	iio_kfifo_free(indio_dev->buffer);
 }
diff --git a/drivers/staging/iio/accel/lis3l02dq_core.c b/drivers/staging/iio/accel/lis3l02dq_core.c
index f5e145caffa9..b78c9c5d5588 100644
--- a/drivers/staging/iio/accel/lis3l02dq_core.c
+++ b/drivers/staging/iio/accel/lis3l02dq_core.c
@@ -716,14 +716,6 @@ static int lis3l02dq_probe(struct spi_device *spi)
 	if (ret)
 		return ret;
 
-	ret = iio_buffer_register(indio_dev,
-				  lis3l02dq_channels,
-				  ARRAY_SIZE(lis3l02dq_channels));
-	if (ret) {
-		dev_err(&spi->dev, "failed to initialize the buffer\n");
-		goto error_unreg_buffer_funcs;
-	}
-
 	if (spi->irq) {
 		ret = request_threaded_irq(st->us->irq,
 					   &lis3l02dq_th,
@@ -732,7 +724,7 @@ static int lis3l02dq_probe(struct spi_device *spi)
 					   "lis3l02dq",
 					   indio_dev);
 		if (ret)
-			goto error_uninitialize_buffer;
+			goto error_unreg_buffer_funcs;
 
 		ret = lis3l02dq_probe_trigger(indio_dev);
 		if (ret)
@@ -756,8 +748,6 @@ error_remove_trigger:
 error_free_interrupt:
 	if (spi->irq)
 		free_irq(st->us->irq, indio_dev);
-error_uninitialize_buffer:
-	iio_buffer_unregister(indio_dev);
 error_unreg_buffer_funcs:
 	lis3l02dq_unconfigure_buffer(indio_dev);
 	return ret;
@@ -804,7 +794,6 @@ static int lis3l02dq_remove(struct spi_device *spi)
 		free_irq(st->us->irq, indio_dev);
 
 	lis3l02dq_remove_trigger(indio_dev);
-	iio_buffer_unregister(indio_dev);
 	lis3l02dq_unconfigure_buffer(indio_dev);
 
 	return 0;
diff --git a/drivers/staging/iio/accel/sca3000_core.c b/drivers/staging/iio/accel/sca3000_core.c
index aef8c916e07b..9cd04c7147c8 100644
--- a/drivers/staging/iio/accel/sca3000_core.c
+++ b/drivers/staging/iio/accel/sca3000_core.c
@@ -1156,11 +1156,6 @@ static int sca3000_probe(struct spi_device *spi)
 	if (ret < 0)
 		return ret;
 
-	ret = iio_buffer_register(indio_dev, indio_dev->channels,
-				  indio_dev->num_channels);
-	if (ret < 0)
-		goto error_unregister_dev;
-
 	if (spi->irq) {
 		ret = request_threaded_irq(spi->irq,
 					   NULL,
@@ -1169,7 +1164,7 @@ static int sca3000_probe(struct spi_device *spi)
 					   "sca3000",
 					   indio_dev);
 		if (ret)
-			goto error_unregister_ring;
+			goto error_unregister_dev;
 	}
 	sca3000_register_ring_funcs(indio_dev);
 	ret = sca3000_clean_setup(st);
@@ -1180,8 +1175,6 @@ static int sca3000_probe(struct spi_device *spi)
 error_free_irq:
 	if (spi->irq)
 		free_irq(spi->irq, indio_dev);
-error_unregister_ring:
-	iio_buffer_unregister(indio_dev);
 error_unregister_dev:
 	iio_device_unregister(indio_dev);
 	return ret;
@@ -1215,7 +1208,6 @@ static int sca3000_remove(struct spi_device *spi)
 	if (spi->irq)
 		free_irq(spi->irq, indio_dev);
 	iio_device_unregister(indio_dev);
-	iio_buffer_unregister(indio_dev);
 	sca3000_unconfigure_ring(indio_dev);
 
 	return 0;
diff --git a/drivers/staging/iio/iio_simple_dummy_buffer.c b/drivers/staging/iio/iio_simple_dummy_buffer.c
index 35d60d5045dc..a2d72c102119 100644
--- a/drivers/staging/iio/iio_simple_dummy_buffer.c
+++ b/drivers/staging/iio/iio_simple_dummy_buffer.c
@@ -172,15 +172,8 @@ int iio_simple_dummy_configure_buffer(struct iio_dev *indio_dev)
 	 */
 	indio_dev->modes |= INDIO_BUFFER_TRIGGERED;
 
-	ret = iio_buffer_register(indio_dev, indio_dev->channels,
-				  indio_dev->num_channels);
-	if (ret)
-		goto error_dealloc_pollfunc;
-
 	return 0;
 
-error_dealloc_pollfunc:
-	iio_dealloc_pollfunc(indio_dev->pollfunc);
 error_free_buffer:
 	iio_kfifo_free(indio_dev->buffer);
 error_ret:
@@ -194,7 +187,6 @@ error_ret:
  */
 void iio_simple_dummy_unconfigure_buffer(struct iio_dev *indio_dev)
 {
-	iio_buffer_unregister(indio_dev);
 	iio_dealloc_pollfunc(indio_dev->pollfunc);
 	iio_kfifo_free(indio_dev->buffer);
 }
diff --git a/drivers/staging/iio/impedance-analyzer/ad5933.c b/drivers/staging/iio/impedance-analyzer/ad5933.c
index aa6a368506cd..c50b1380b9aa 100644
--- a/drivers/staging/iio/impedance-analyzer/ad5933.c
+++ b/drivers/staging/iio/impedance-analyzer/ad5933.c
@@ -752,23 +752,16 @@ static int ad5933_probe(struct i2c_client *client,
 	if (ret)
 		goto error_disable_reg;
 
-	ret = iio_buffer_register(indio_dev, ad5933_channels,
-		ARRAY_SIZE(ad5933_channels));
-	if (ret)
-		goto error_unreg_ring;
-
 	ret = ad5933_setup(st);
 	if (ret)
-		goto error_uninitialize_ring;
+		goto error_unreg_ring;
 
 	ret = iio_device_register(indio_dev);
 	if (ret)
-		goto error_uninitialize_ring;
+		goto error_unreg_ring;
 
 	return 0;
 
-error_uninitialize_ring:
-	iio_buffer_unregister(indio_dev);
 error_unreg_ring:
 	iio_kfifo_free(indio_dev->buffer);
 error_disable_reg:
@@ -784,7 +777,6 @@ static int ad5933_remove(struct i2c_client *client)
 	struct ad5933_state *st = iio_priv(indio_dev);
 
 	iio_device_unregister(indio_dev);
-	iio_buffer_unregister(indio_dev);
 	iio_kfifo_free(indio_dev->buffer);
 	if (!IS_ERR(st->reg))
 		regulator_disable(st->reg);
diff --git a/drivers/staging/iio/meter/ade7758.h b/drivers/staging/iio/meter/ade7758.h
index 07318203a836..762d7dc0e6e2 100644
--- a/drivers/staging/iio/meter/ade7758.h
+++ b/drivers/staging/iio/meter/ade7758.h
@@ -146,7 +146,6 @@ ssize_t ade7758_read_data_from_ring(struct device *dev,
 int ade7758_configure_ring(struct iio_dev *indio_dev);
 void ade7758_unconfigure_ring(struct iio_dev *indio_dev);
 
-void ade7758_uninitialize_ring(struct iio_dev *indio_dev);
 int ade7758_set_irq(struct device *dev, bool enable);
 
 int ade7758_spi_write_reg_8(struct device *dev,
diff --git a/drivers/staging/iio/meter/ade7758_core.c b/drivers/staging/iio/meter/ade7758_core.c
index abc60067cd72..6e8b01104a68 100644
--- a/drivers/staging/iio/meter/ade7758_core.c
+++ b/drivers/staging/iio/meter/ade7758_core.c
@@ -885,23 +885,15 @@ static int ade7758_probe(struct spi_device *spi)
 	if (ret)
 		goto error_free_tx;
 
-	ret = iio_buffer_register(indio_dev,
-				  &ade7758_channels[0],
-				  ARRAY_SIZE(ade7758_channels));
-	if (ret) {
-		dev_err(&spi->dev, "failed to initialize the ring\n");
-		goto error_unreg_ring_funcs;
-	}
-
 	/* Get the device into a sane initial state */
 	ret = ade7758_initial_setup(indio_dev);
 	if (ret)
-		goto error_uninitialize_ring;
+		goto error_unreg_ring_funcs;
 
 	if (spi->irq) {
 		ret = ade7758_probe_trigger(indio_dev);
 		if (ret)
-			goto error_uninitialize_ring;
+			goto error_unreg_ring_funcs;
 	}
 
 	ret = iio_device_register(indio_dev);
@@ -913,8 +905,6 @@ static int ade7758_probe(struct spi_device *spi)
 error_remove_trigger:
 	if (spi->irq)
 		ade7758_remove_trigger(indio_dev);
-error_uninitialize_ring:
-	ade7758_uninitialize_ring(indio_dev);
 error_unreg_ring_funcs:
 	ade7758_unconfigure_ring(indio_dev);
 error_free_tx:
@@ -932,7 +922,6 @@ static int ade7758_remove(struct spi_device *spi)
 	iio_device_unregister(indio_dev);
 	ade7758_stop_device(&indio_dev->dev);
 	ade7758_remove_trigger(indio_dev);
-	ade7758_uninitialize_ring(indio_dev);
 	ade7758_unconfigure_ring(indio_dev);
 	kfree(st->tx);
 	kfree(st->rx);
diff --git a/drivers/staging/iio/meter/ade7758_ring.c b/drivers/staging/iio/meter/ade7758_ring.c
index c0accf8cce93..27c3ed6ca468 100644
--- a/drivers/staging/iio/meter/ade7758_ring.c
+++ b/drivers/staging/iio/meter/ade7758_ring.c
@@ -181,8 +181,3 @@ error_iio_kfifo_free:
 	iio_kfifo_free(indio_dev->buffer);
 	return ret;
 }
-
-void ade7758_uninitialize_ring(struct iio_dev *indio_dev)
-{
-	iio_buffer_unregister(indio_dev);
-}
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 8c8ce611949c..b0e006c3db43 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -150,22 +150,6 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 
 int iio_update_demux(struct iio_dev *indio_dev);
 
-/**
- * iio_buffer_register() - register the buffer with IIO core
- * @indio_dev:		device with the buffer to be registered
- * @channels:		the channel descriptions used to construct buffer
- * @num_channels:	the number of channels
- **/
-int iio_buffer_register(struct iio_dev *indio_dev,
-			const struct iio_chan_spec *channels,
-			int num_channels);
-
-/**
- * iio_buffer_unregister() - unregister the buffer from IIO core
- * @indio_dev:		the device with the buffer to be unregistered
- **/
-void iio_buffer_unregister(struct iio_dev *indio_dev);
-
 /**
  * iio_buffer_read_length() - attr func to get number of datums in the buffer
  **/
@@ -223,16 +207,6 @@ static inline void iio_device_attach_buffer(struct iio_dev *indio_dev,
 
 #else /* CONFIG_IIO_BUFFER */
 
-static inline int iio_buffer_register(struct iio_dev *indio_dev,
-					   const struct iio_chan_spec *channels,
-					   int num_channels)
-{
-	return 0;
-}
-
-static inline void iio_buffer_unregister(struct iio_dev *indio_dev)
-{}
-
 static inline void iio_buffer_get(struct iio_buffer *buffer) {}
 static inline void iio_buffer_put(struct iio_buffer *buffer) {}
 
-- 
cgit v1.2.3


From 616dde2a1ea3df9398b1fcc7d6d6516c5fab6183 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 26 Nov 2014 18:55:13 +0100
Subject: iio: Remove get_bytes_per_datum() from iio_buffer_access_funcs

There haven't been any users of the get_bytes_per_datum() callback for a
while. The core assumes that the number of bytes per datum can be calculated
based on the enabled channels and the storage size of the channel and
iio_compute_scan_bytes() is used to compute this number. So remove the
callback.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/kfifo_buf.c                    | 6 ------
 drivers/staging/iio/Documentation/ring.txt | 4 ++--
 drivers/staging/iio/accel/sca3000_ring.c   | 7 -------
 include/linux/iio/buffer.h                 | 2 --
 4 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c
index 7134e8ada09a..1258b4e0a722 100644
--- a/drivers/iio/kfifo_buf.c
+++ b/drivers/iio/kfifo_buf.c
@@ -66,11 +66,6 @@ static struct attribute_group iio_kfifo_attribute_group = {
 	.name = "buffer",
 };
 
-static int iio_get_bytes_per_datum_kfifo(struct iio_buffer *r)
-{
-	return r->bytes_per_datum;
-}
-
 static int iio_mark_update_needed_kfifo(struct iio_buffer *r)
 {
 	struct iio_kfifo *kf = iio_to_kfifo(r);
@@ -159,7 +154,6 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = {
 	.read_first_n = &iio_read_first_n_kfifo,
 	.data_available = iio_kfifo_buf_data_available,
 	.request_update = &iio_request_update_kfifo,
-	.get_bytes_per_datum = &iio_get_bytes_per_datum_kfifo,
 	.set_bytes_per_datum = &iio_set_bytes_per_datum_kfifo,
 	.get_length = &iio_get_length_kfifo,
 	.set_length = &iio_set_length_kfifo,
diff --git a/drivers/staging/iio/Documentation/ring.txt b/drivers/staging/iio/Documentation/ring.txt
index e1da43381d0e..434d63a6123a 100644
--- a/drivers/staging/iio/Documentation/ring.txt
+++ b/drivers/staging/iio/Documentation/ring.txt
@@ -39,8 +39,8 @@ request_update
   If parameters have changed that require reinitialization or configuration of
   the buffer this will trigger it.
 
-get_bytes_per_datum, set_bytes_per_datum
-  Get/set the number of bytes for a complete scan. (All samples + timestamp)
+set_bytes_per_datum
+  Set the number of bytes for a complete scan. (All samples + timestamp)
 
 get_length / set_length
   Get/set the number of complete scans that may be held by the buffer.
diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c
index 157827651bfa..aa0e5d85bfe2 100644
--- a/drivers/staging/iio/accel/sca3000_ring.c
+++ b/drivers/staging/iio/accel/sca3000_ring.c
@@ -135,12 +135,6 @@ static int sca3000_ring_get_length(struct iio_buffer *r)
 	return 64;
 }
 
-/* only valid if resolution is kept at 11bits */
-static int sca3000_ring_get_bytes_per_datum(struct iio_buffer *r)
-{
-	return 6;
-}
-
 static bool sca3000_ring_buf_data_available(struct iio_buffer *r)
 {
 	return r->stufftoread;
@@ -278,7 +272,6 @@ static void sca3000_ring_release(struct iio_buffer *r)
 static const struct iio_buffer_access_funcs sca3000_ring_access_funcs = {
 	.read_first_n = &sca3000_read_first_n_hw_rb,
 	.get_length = &sca3000_ring_get_length,
-	.get_bytes_per_datum = &sca3000_ring_get_bytes_per_datum,
 	.data_available = sca3000_ring_buf_data_available,
 	.release = sca3000_ring_release,
 };
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index b0e006c3db43..79cdb3d1b1bc 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -25,7 +25,6 @@ struct iio_buffer;
  *			available.
  * @request_update:	if a parameter change has been marked, update underlying
  *			storage.
- * @get_bytes_per_datum:get current bytes per datum
  * @set_bytes_per_datum:set number of bytes per datum
  * @get_length:		get number of datums in buffer
  * @set_length:		set number of datums in buffer
@@ -49,7 +48,6 @@ struct iio_buffer_access_funcs {
 
 	int (*request_update)(struct iio_buffer *buffer);
 
-	int (*get_bytes_per_datum)(struct iio_buffer *buffer);
 	int (*set_bytes_per_datum)(struct iio_buffer *buffer, size_t bpd);
 	int (*get_length)(struct iio_buffer *buffer);
 	int (*set_length)(struct iio_buffer *buffer, int length);
-- 
cgit v1.2.3


From 08e7e0adaa17205f86894157d86c4bee3c714330 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 26 Nov 2014 18:55:15 +0100
Subject: iio: buffer: Allocate standard attributes in the core

All buffers want at least the length and the enable attribute. Move the
creation of those attributes to the core instead of having to do this in
each individual buffer implementation. This allows us to get rid of some
boiler-plate code.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-buffer.c        | 59 ++++++++++++++++++++++----------
 drivers/iio/kfifo_buf.c                  | 15 --------
 drivers/staging/iio/accel/sca3000_ring.c | 14 ++------
 include/linux/iio/buffer.h               | 37 ++------------------
 4 files changed, 45 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 8cd89eb269c5..ba89357fc096 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -383,9 +383,9 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 	return ret;
 }
 
-ssize_t iio_buffer_read_length(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf)
+static ssize_t iio_buffer_read_length(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
 {
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
 	struct iio_buffer *buffer = indio_dev->buffer;
@@ -396,12 +396,10 @@ ssize_t iio_buffer_read_length(struct device *dev,
 
 	return 0;
 }
-EXPORT_SYMBOL(iio_buffer_read_length);
 
-ssize_t iio_buffer_write_length(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf,
-				size_t len)
+static ssize_t iio_buffer_write_length(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t len)
 {
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
 	struct iio_buffer *buffer = indio_dev->buffer;
@@ -428,16 +426,14 @@ ssize_t iio_buffer_write_length(struct device *dev,
 
 	return ret ? ret : len;
 }
-EXPORT_SYMBOL(iio_buffer_write_length);
 
-ssize_t iio_buffer_show_enable(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf)
+static ssize_t iio_buffer_show_enable(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
 {
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
 	return sprintf(buf, "%d\n", iio_buffer_is_active(indio_dev->buffer));
 }
-EXPORT_SYMBOL(iio_buffer_show_enable);
 
 static int iio_compute_scan_bytes(struct iio_dev *indio_dev,
 				const unsigned long *mask, bool timestamp)
@@ -724,10 +720,10 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(iio_update_buffers);
 
-ssize_t iio_buffer_store_enable(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf,
-				size_t len)
+static ssize_t iio_buffer_store_enable(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf,
+				       size_t len)
 {
 	int ret;
 	bool requested_state;
@@ -759,10 +755,14 @@ done:
 	mutex_unlock(&indio_dev->mlock);
 	return (ret < 0) ? ret : len;
 }
-EXPORT_SYMBOL(iio_buffer_store_enable);
 
 static const char * const iio_scan_elements_group_name = "scan_elements";
 
+static DEVICE_ATTR(length, S_IRUGO | S_IWUSR, iio_buffer_read_length,
+		   iio_buffer_write_length);
+static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
+		   iio_buffer_show_enable, iio_buffer_store_enable);
+
 int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 {
 	struct iio_dev_attr *p;
@@ -774,6 +774,27 @@ int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 	if (!buffer)
 		return 0;
 
+	attrcount = 0;
+	if (buffer->attrs) {
+		while (buffer->attrs[attrcount] != NULL)
+			attrcount++;
+	}
+
+	buffer->buffer_group.name = "buffer";
+	buffer->buffer_group.attrs = kcalloc(attrcount + 3,
+			sizeof(*buffer->buffer_group.attrs), GFP_KERNEL);
+	if (!buffer->buffer_group.attrs)
+		return -ENOMEM;
+
+	buffer->buffer_group.attrs[0] = &dev_attr_length.attr;
+	buffer->buffer_group.attrs[1] = &dev_attr_enable.attr;
+	if (buffer->attrs)
+		memcpy(&buffer->buffer_group.attrs[2], buffer->attrs,
+			sizeof(*&buffer->buffer_group.attrs) * (attrcount - 2));
+	buffer->buffer_group.attrs[attrcount+2] = NULL;
+
+	indio_dev->groups[indio_dev->groupcounter++] = &buffer->buffer_group;
+
 	if (buffer->scan_el_attrs != NULL) {
 		attr = buffer->scan_el_attrs->attrs;
 		while (*attr++ != NULL)
@@ -838,6 +859,7 @@ error_free_scan_mask:
 	kfree(buffer->scan_mask);
 error_cleanup_dynamic:
 	iio_free_chan_devattr_list(&buffer->scan_el_dev_attr_list);
+	kfree(indio_dev->buffer->buffer_group.attrs);
 
 	return ret;
 }
@@ -848,6 +870,7 @@ void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev)
 		return;
 
 	kfree(indio_dev->buffer->scan_mask);
+	kfree(indio_dev->buffer->buffer_group.attrs);
 	kfree(indio_dev->buffer->scan_el_group.attrs);
 	iio_free_chan_devattr_list(&indio_dev->buffer->scan_el_dev_attr_list);
 }
diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c
index 1258b4e0a722..3b0a3bc4f0ad 100644
--- a/drivers/iio/kfifo_buf.c
+++ b/drivers/iio/kfifo_buf.c
@@ -52,20 +52,6 @@ static int iio_get_length_kfifo(struct iio_buffer *r)
 	return r->length;
 }
 
-static IIO_BUFFER_ENABLE_ATTR;
-static IIO_BUFFER_LENGTH_ATTR;
-
-static struct attribute *iio_kfifo_attributes[] = {
-	&dev_attr_length.attr,
-	&dev_attr_enable.attr,
-	NULL,
-};
-
-static struct attribute_group iio_kfifo_attribute_group = {
-	.attrs = iio_kfifo_attributes,
-	.name = "buffer",
-};
-
 static int iio_mark_update_needed_kfifo(struct iio_buffer *r)
 {
 	struct iio_kfifo *kf = iio_to_kfifo(r);
@@ -169,7 +155,6 @@ struct iio_buffer *iio_kfifo_allocate(struct iio_dev *indio_dev)
 		return NULL;
 	kf->update_needed = true;
 	iio_buffer_init(&kf->buffer);
-	kf->buffer.attrs = &iio_kfifo_attribute_group;
 	kf->buffer.access = &kfifo_access_funcs;
 	kf->buffer.length = 2;
 	mutex_init(&kf->user_lock);
diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c
index aa0e5d85bfe2..f2f260e01550 100644
--- a/drivers/staging/iio/accel/sca3000_ring.c
+++ b/drivers/staging/iio/accel/sca3000_ring.c
@@ -140,9 +140,6 @@ static bool sca3000_ring_buf_data_available(struct iio_buffer *r)
 	return r->stufftoread;
 }
 
-static IIO_BUFFER_ENABLE_ATTR;
-static IIO_BUFFER_LENGTH_ATTR;
-
 /**
  * sca3000_query_ring_int() is the hardware ring status interrupt enabled
  **/
@@ -232,20 +229,13 @@ static IIO_DEVICE_ATTR(in_accel_scale,
  * only apply to the ring buffer.  At all times full rate and accuracy
  * is available via direct reading from registers.
  */
-static struct attribute *sca3000_ring_attributes[] = {
-	&dev_attr_length.attr,
-	&dev_attr_enable.attr,
+static const struct attribute *sca3000_ring_attributes[] = {
 	&iio_dev_attr_50_percent.dev_attr.attr,
 	&iio_dev_attr_75_percent.dev_attr.attr,
 	&iio_dev_attr_in_accel_scale.dev_attr.attr,
 	NULL,
 };
 
-static struct attribute_group sca3000_ring_attr = {
-	.attrs = sca3000_ring_attributes,
-	.name = "buffer",
-};
-
 static struct iio_buffer *sca3000_rb_allocate(struct iio_dev *indio_dev)
 {
 	struct iio_buffer *buf;
@@ -258,7 +248,7 @@ static struct iio_buffer *sca3000_rb_allocate(struct iio_dev *indio_dev)
 	ring->private = indio_dev;
 	buf = &ring->buf;
 	buf->stufftoread = 0;
-	buf->attrs = &sca3000_ring_attr;
+	buf->attrs = sca3000_ring_attributes;
 	iio_buffer_init(buf);
 
 	return buf;
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 79cdb3d1b1bc..16b7663036f2 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -83,10 +83,11 @@ struct iio_buffer {
 	bool					scan_timestamp;
 	const struct iio_buffer_access_funcs	*access;
 	struct list_head			scan_el_dev_attr_list;
+	struct attribute_group			buffer_group;
 	struct attribute_group			scan_el_group;
 	wait_queue_head_t			pollq;
 	bool					stufftoread;
-	const struct attribute_group *attrs;
+	const struct attribute			**attrs;
 	struct list_head			demux_list;
 	void					*demux_bounce;
 	struct list_head			buffer_list;
@@ -148,40 +149,6 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 
 int iio_update_demux(struct iio_dev *indio_dev);
 
-/**
- * iio_buffer_read_length() - attr func to get number of datums in the buffer
- **/
-ssize_t iio_buffer_read_length(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf);
-/**
- * iio_buffer_write_length() - attr func to set number of datums in the buffer
- **/
-ssize_t iio_buffer_write_length(struct device *dev,
-			      struct device_attribute *attr,
-			      const char *buf,
-			      size_t len);
-/**
- * iio_buffer_store_enable() - attr to turn the buffer on
- **/
-ssize_t iio_buffer_store_enable(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf,
-				size_t len);
-/**
- * iio_buffer_show_enable() - attr to see if the buffer is on
- **/
-ssize_t iio_buffer_show_enable(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf);
-#define IIO_BUFFER_LENGTH_ATTR DEVICE_ATTR(length, S_IRUGO | S_IWUSR,	\
-					   iio_buffer_read_length,	\
-					   iio_buffer_write_length)
-
-#define IIO_BUFFER_ENABLE_ATTR DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,	\
-					   iio_buffer_show_enable,	\
-					   iio_buffer_store_enable)
-
 bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 	const unsigned long *mask);
 
-- 
cgit v1.2.3


From 374956600ecbedf5ca29c76bde114160eb805091 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 26 Nov 2014 18:55:17 +0100
Subject: iio: buffer: Drop get_length callback

We already do have the length field in the struct iio_buffer which is
expected to be in sync with the current size of the buffer. And currently
all implementations of the get_length callback either return this field or a
constant number.

This patch removes the get_length callback and replaces all occurrences in
the IIO core with directly accessing the length field of the buffer.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-buffer.c          | 11 +++--------
 drivers/iio/kfifo_buf.c                    |  6 ------
 drivers/staging/iio/Documentation/ring.txt |  4 ++--
 drivers/staging/iio/accel/sca3000_ring.c   |  8 +-------
 include/linux/iio/buffer.h                 |  2 --
 5 files changed, 6 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 4ca4c0a09923..2bd8d399f2ec 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -390,11 +390,7 @@ static ssize_t iio_buffer_read_length(struct device *dev,
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
 	struct iio_buffer *buffer = indio_dev->buffer;
 
-	if (buffer->access->get_length)
-		return sprintf(buf, "%d\n",
-			       buffer->access->get_length(buffer));
-
-	return 0;
+	return sprintf(buf, "%d\n", buffer->length);
 }
 
 static ssize_t iio_buffer_write_length(struct device *dev,
@@ -410,9 +406,8 @@ static ssize_t iio_buffer_write_length(struct device *dev,
 	if (ret)
 		return ret;
 
-	if (buffer->access->get_length)
-		if (val == buffer->access->get_length(buffer))
-			return len;
+	if (val == buffer->length)
+		return len;
 
 	mutex_lock(&indio_dev->mlock);
 	if (iio_buffer_is_active(indio_dev->buffer)) {
diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c
index 3b0a3bc4f0ad..b20a9cfbc8ed 100644
--- a/drivers/iio/kfifo_buf.c
+++ b/drivers/iio/kfifo_buf.c
@@ -47,11 +47,6 @@ static int iio_request_update_kfifo(struct iio_buffer *r)
 	return ret;
 }
 
-static int iio_get_length_kfifo(struct iio_buffer *r)
-{
-	return r->length;
-}
-
 static int iio_mark_update_needed_kfifo(struct iio_buffer *r)
 {
 	struct iio_kfifo *kf = iio_to_kfifo(r);
@@ -141,7 +136,6 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = {
 	.data_available = iio_kfifo_buf_data_available,
 	.request_update = &iio_request_update_kfifo,
 	.set_bytes_per_datum = &iio_set_bytes_per_datum_kfifo,
-	.get_length = &iio_get_length_kfifo,
 	.set_length = &iio_set_length_kfifo,
 	.release = &iio_kfifo_buffer_release,
 };
diff --git a/drivers/staging/iio/Documentation/ring.txt b/drivers/staging/iio/Documentation/ring.txt
index 434d63a6123a..18718fcaf259 100644
--- a/drivers/staging/iio/Documentation/ring.txt
+++ b/drivers/staging/iio/Documentation/ring.txt
@@ -42,6 +42,6 @@ request_update
 set_bytes_per_datum
   Set the number of bytes for a complete scan. (All samples + timestamp)
 
-get_length / set_length
-  Get/set the number of complete scans that may be held by the buffer.
+set_length
+  Set the number of complete scans that may be held by the buffer.
 
diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c
index f2f260e01550..f76a26885808 100644
--- a/drivers/staging/iio/accel/sca3000_ring.c
+++ b/drivers/staging/iio/accel/sca3000_ring.c
@@ -129,12 +129,6 @@ error_ret:
 	return ret ? ret : num_read;
 }
 
-/* This is only valid with all 3 elements enabled */
-static int sca3000_ring_get_length(struct iio_buffer *r)
-{
-	return 64;
-}
-
 static bool sca3000_ring_buf_data_available(struct iio_buffer *r)
 {
 	return r->stufftoread;
@@ -248,6 +242,7 @@ static struct iio_buffer *sca3000_rb_allocate(struct iio_dev *indio_dev)
 	ring->private = indio_dev;
 	buf = &ring->buf;
 	buf->stufftoread = 0;
+	buf->length = 64;
 	buf->attrs = sca3000_ring_attributes;
 	iio_buffer_init(buf);
 
@@ -261,7 +256,6 @@ static void sca3000_ring_release(struct iio_buffer *r)
 
 static const struct iio_buffer_access_funcs sca3000_ring_access_funcs = {
 	.read_first_n = &sca3000_read_first_n_hw_rb,
-	.get_length = &sca3000_ring_get_length,
 	.data_available = sca3000_ring_buf_data_available,
 	.release = sca3000_ring_release,
 };
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 16b7663036f2..b65850a41127 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -26,7 +26,6 @@ struct iio_buffer;
  * @request_update:	if a parameter change has been marked, update underlying
  *			storage.
  * @set_bytes_per_datum:set number of bytes per datum
- * @get_length:		get number of datums in buffer
  * @set_length:		set number of datums in buffer
  * @release:		called when the last reference to the buffer is dropped,
  *			should free all resources allocated by the buffer.
@@ -49,7 +48,6 @@ struct iio_buffer_access_funcs {
 	int (*request_update)(struct iio_buffer *buffer);
 
 	int (*set_bytes_per_datum)(struct iio_buffer *buffer, size_t bpd);
-	int (*get_length)(struct iio_buffer *buffer);
 	int (*set_length)(struct iio_buffer *buffer, int length);
 
 	void (*release)(struct iio_buffer *buffer);
-- 
cgit v1.2.3


From 09546a30632fd35996373146657d5a0296fd37ca Mon Sep 17 00:00:00 2001
From: "Ivan T. Ivanov" <iivanov@mm-sol.com>
Date: Tue, 23 Sep 2014 15:51:42 +0300
Subject: iio: consumer.h: Fix scale factor in function comment

1 milivolt is equal to 1000000 nanovolts.

Signed-off-by: Ivan T. Ivanov <iivanov@mm-sol.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/consumer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index 6f64624f329b..26fb8f6342bb 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -201,7 +201,7 @@ int iio_read_channel_scale(struct iio_channel *chan, int *val,
  * The scale factor allows to increase the precession of the returned value. For
  * a scale factor of 1 the function will return the result in the normal IIO
  * unit for the channel type. E.g. millivolt for voltage channels, if you want
- * nanovolts instead pass 1000 as the scale factor.
+ * nanovolts instead pass 1000000 as the scale factor.
  */
 int iio_convert_raw_to_processed(struct iio_channel *chan, int raw,
 	int *processed, unsigned int scale);
-- 
cgit v1.2.3


From 3a2c0ba5ad00c018c0bef39a2224aca950aa33f2 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 8 Dec 2014 16:50:37 +0800
Subject: hwrng: use reference counts on each struct hwrng.

current_rng holds one reference, and we bump it every time we want
to do a read from it.

This means we only hold the rng_mutex to grab or drop a reference,
so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
block on read of /dev/hwrng.

Using a kref is overkill (we're always under the rng_mutex), but
a standard pattern.

This also solves the problem that the hwrng_fillfn thread was
accessing current_rng without a lock, which could change (eg. to NULL)
underneath it.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Amos Kong <akong@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/core.c | 135 ++++++++++++++++++++++++++++--------------
 include/linux/hw_random.h     |   2 +
 2 files changed, 94 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index b4c0e873d362..089c18dc579e 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -42,6 +42,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/random.h>
+#include <linux/err.h>
 #include <asm/uaccess.h>
 
 
@@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng)
 		add_device_randomness(bytes, bytes_read);
 }
 
+static inline void cleanup_rng(struct kref *kref)
+{
+	struct hwrng *rng = container_of(kref, struct hwrng, ref);
+
+	if (rng->cleanup)
+		rng->cleanup(rng);
+}
+
+static void set_current_rng(struct hwrng *rng)
+{
+	BUG_ON(!mutex_is_locked(&rng_mutex));
+	kref_get(&rng->ref);
+	current_rng = rng;
+}
+
+static void drop_current_rng(void)
+{
+	BUG_ON(!mutex_is_locked(&rng_mutex));
+	if (!current_rng)
+		return;
+
+	/* decrease last reference for triggering the cleanup */
+	kref_put(&current_rng->ref, cleanup_rng);
+	current_rng = NULL;
+}
+
+/* Returns ERR_PTR(), NULL or refcounted hwrng */
+static struct hwrng *get_current_rng(void)
+{
+	struct hwrng *rng;
+
+	if (mutex_lock_interruptible(&rng_mutex))
+		return ERR_PTR(-ERESTARTSYS);
+
+	rng = current_rng;
+	if (rng)
+		kref_get(&rng->ref);
+
+	mutex_unlock(&rng_mutex);
+	return rng;
+}
+
+static void put_rng(struct hwrng *rng)
+{
+	/*
+	 * Hold rng_mutex here so we serialize in case they set_current_rng
+	 * on rng again immediately.
+	 */
+	mutex_lock(&rng_mutex);
+	if (rng)
+		kref_put(&rng->ref, cleanup_rng);
+	mutex_unlock(&rng_mutex);
+}
+
 static inline int hwrng_init(struct hwrng *rng)
 {
 	if (rng->init) {
@@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng)
 	return 0;
 }
 
-static inline void hwrng_cleanup(struct hwrng *rng)
-{
-	if (rng && rng->cleanup)
-		rng->cleanup(rng);
-}
-
 static int rng_dev_open(struct inode *inode, struct file *filp)
 {
 	/* enforce read-only access to this chrdev */
@@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 	ssize_t ret = 0;
 	int err = 0;
 	int bytes_read, len;
+	struct hwrng *rng;
 
 	while (size) {
-		if (mutex_lock_interruptible(&rng_mutex)) {
-			err = -ERESTARTSYS;
+		rng = get_current_rng();
+		if (IS_ERR(rng)) {
+			err = PTR_ERR(rng);
 			goto out;
 		}
-
-		if (!current_rng) {
+		if (!rng) {
 			err = -ENODEV;
-			goto out_unlock;
+			goto out;
 		}
 
 		mutex_lock(&reading_mutex);
 		if (!data_avail) {
-			bytes_read = rng_get_data(current_rng, rng_buffer,
+			bytes_read = rng_get_data(rng, rng_buffer,
 				rng_buffer_size(),
 				!(filp->f_flags & O_NONBLOCK));
 			if (bytes_read < 0) {
@@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 			ret += len;
 		}
 
-		mutex_unlock(&rng_mutex);
 		mutex_unlock(&reading_mutex);
+		put_rng(rng);
 
 		if (need_resched())
 			schedule_timeout_interruptible(1);
@@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 	}
 out:
 	return ret ? : err;
-out_unlock:
-	mutex_unlock(&rng_mutex);
-	goto out;
+
 out_unlock_reading:
 	mutex_unlock(&reading_mutex);
-	goto out_unlock;
+	put_rng(rng);
+	goto out;
 }
 
 
@@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev,
 			err = hwrng_init(rng);
 			if (err)
 				break;
-			hwrng_cleanup(current_rng);
-			current_rng = rng;
+			drop_current_rng();
+			set_current_rng(rng);
 			err = 0;
 			break;
 		}
@@ -272,17 +321,15 @@ static ssize_t hwrng_attr_current_show(struct device *dev,
 				       struct device_attribute *attr,
 				       char *buf)
 {
-	int err;
 	ssize_t ret;
-	const char *name = "none";
+	struct hwrng *rng;
 
-	err = mutex_lock_interruptible(&rng_mutex);
-	if (err)
-		return -ERESTARTSYS;
-	if (current_rng)
-		name = current_rng->name;
-	ret = snprintf(buf, PAGE_SIZE, "%s\n", name);
-	mutex_unlock(&rng_mutex);
+	rng = get_current_rng();
+	if (IS_ERR(rng))
+		return PTR_ERR(rng);
+
+	ret = snprintf(buf, PAGE_SIZE, "%s\n", rng ? rng->name : "none");
+	put_rng(rng);
 
 	return ret;
 }
@@ -353,12 +400,16 @@ static int hwrng_fillfn(void *unused)
 	long rc;
 
 	while (!kthread_should_stop()) {
-		if (!current_rng)
+		struct hwrng *rng;
+
+		rng = get_current_rng();
+		if (IS_ERR(rng) || !rng)
 			break;
 		mutex_lock(&reading_mutex);
-		rc = rng_get_data(current_rng, rng_fillbuf,
+		rc = rng_get_data(rng, rng_fillbuf,
 				  rng_buffer_size(), 1);
 		mutex_unlock(&reading_mutex);
+		put_rng(rng);
 		if (rc <= 0) {
 			pr_warn("hwrng: no data available\n");
 			msleep_interruptible(10000);
@@ -419,14 +470,13 @@ int hwrng_register(struct hwrng *rng)
 		err = hwrng_init(rng);
 		if (err)
 			goto out_unlock;
-		current_rng = rng;
+		set_current_rng(rng);
 	}
 	err = 0;
 	if (!old_rng) {
 		err = register_miscdev();
 		if (err) {
-			hwrng_cleanup(rng);
-			current_rng = NULL;
+			drop_current_rng();
 			goto out_unlock;
 		}
 	}
@@ -453,22 +503,21 @@ EXPORT_SYMBOL_GPL(hwrng_register);
 
 void hwrng_unregister(struct hwrng *rng)
 {
-	int err;
-
 	mutex_lock(&rng_mutex);
 
 	list_del(&rng->list);
 	if (current_rng == rng) {
-		hwrng_cleanup(rng);
-		if (list_empty(&rng_list)) {
-			current_rng = NULL;
-		} else {
-			current_rng = list_entry(rng_list.prev, struct hwrng, list);
-			err = hwrng_init(current_rng);
-			if (err)
-				current_rng = NULL;
+		drop_current_rng();
+		if (!list_empty(&rng_list)) {
+			struct hwrng *tail;
+
+			tail = list_entry(rng_list.prev, struct hwrng, list);
+
+			if (hwrng_init(tail) == 0)
+				set_current_rng(tail);
 		}
 	}
+
 	if (list_empty(&rng_list)) {
 		mutex_unlock(&rng_mutex);
 		unregister_miscdev();
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index 914bb08cd738..c212e71ea886 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/kref.h>
 
 /**
  * struct hwrng - Hardware Random Number Generator driver
@@ -44,6 +45,7 @@ struct hwrng {
 
 	/* internal. */
 	struct list_head list;
+	struct kref ref;
 };
 
 /** Register a new Hardware Random Number Generator driver. */
-- 
cgit v1.2.3


From a027f30d72f2c4d27d6dd9bd053205d3102de7d1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 8 Dec 2014 16:50:38 +0800
Subject: hwrng: fix unregister race.

The previous patch added one potential problem: we can still be
reading from a hwrng when it's unregistered.  Add a wait for zero
in the hwrng_unregister path.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Amos Kong <akong@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/core.c | 12 ++++++++++++
 include/linux/hw_random.h     |  1 +
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 089c18dc579e..8d609a026465 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
 static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
+static DECLARE_WAIT_QUEUE_HEAD(rng_done);
 static unsigned short current_quality;
 static unsigned short default_quality; /* = 0; default to "off" */
 
@@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref)
 
 	if (rng->cleanup)
 		rng->cleanup(rng);
+
+	/* cleanup_done should be updated after cleanup finishes */
+	smp_wmb();
+	rng->cleanup_done = true;
+	wake_up_all(&rng_done);
 }
 
 static void set_current_rng(struct hwrng *rng)
@@ -494,6 +500,8 @@ int hwrng_register(struct hwrng *rng)
 		add_early_randomness(rng);
 	}
 
+	rng->cleanup_done = false;
+
 out_unlock:
 	mutex_unlock(&rng_mutex);
 out:
@@ -525,6 +533,10 @@ void hwrng_unregister(struct hwrng *rng)
 			kthread_stop(hwrng_fill);
 	} else
 		mutex_unlock(&rng_mutex);
+
+	/* Just in case rng is reading right now, wait. */
+	wait_event(rng_done, rng->cleanup_done &&
+		   atomic_read(&rng->ref.refcount) == 0);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index c212e71ea886..7832e5008959 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -46,6 +46,7 @@ struct hwrng {
 	/* internal. */
 	struct list_head list;
 	struct kref ref;
+	bool cleanup_done;
 };
 
 /** Register a new Hardware Random Number Generator driver. */
-- 
cgit v1.2.3


From 556d2f055bf6d79ce81587dfe774d4dd10da473f Mon Sep 17 00:00:00 2001
From: Yalin Wang <Yalin.Wang@sonymobile.com>
Date: Mon, 3 Nov 2014 03:01:03 +0100
Subject: ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit
 instruction

this change add CONFIG_HAVE_ARCH_BITREVERSE config option,
so that we can use some architecture's bitrev hardware instruction
to do bitrev operation.

Introduce __constant_bitrev* macro for constant bitrev operation.

Change __bitrev16() __bitrev32() to be inline function,
don't need export symbol for these tiny functions.

Signed-off-by: Yalin Wang <yalin.wang@sonymobile.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/bitrev.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++---
 lib/Kconfig            |  9 ++++++
 lib/bitrev.c           | 17 ++---------
 3 files changed, 84 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h
index 7ffe03f4693d..fb790b8449c1 100644
--- a/include/linux/bitrev.h
+++ b/include/linux/bitrev.h
@@ -3,14 +3,83 @@
 
 #include <linux/types.h>
 
-extern u8 const byte_rev_table[256];
+#ifdef CONFIG_HAVE_ARCH_BITREVERSE
+#include <asm/bitrev.h>
+
+#define __bitrev32 __arch_bitrev32
+#define __bitrev16 __arch_bitrev16
+#define __bitrev8 __arch_bitrev8
 
-static inline u8 bitrev8(u8 byte)
+#else
+extern u8 const byte_rev_table[256];
+static inline u8 __bitrev8(u8 byte)
 {
 	return byte_rev_table[byte];
 }
 
-extern u16 bitrev16(u16 in);
-extern u32 bitrev32(u32 in);
+static inline u16 __bitrev16(u16 x)
+{
+	return (__bitrev8(x & 0xff) << 8) | __bitrev8(x >> 8);
+}
+
+static inline u32 __bitrev32(u32 x)
+{
+	return (__bitrev16(x & 0xffff) << 16) | __bitrev16(x >> 16);
+}
+
+#endif /* CONFIG_HAVE_ARCH_BITREVERSE */
+
+#define __constant_bitrev32(x)	\
+({					\
+	u32 __x = x;			\
+	__x = (__x >> 16) | (__x << 16);	\
+	__x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8);	\
+	__x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4);	\
+	__x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2);	\
+	__x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1);	\
+	__x;								\
+})
+
+#define __constant_bitrev16(x)	\
+({					\
+	u16 __x = x;			\
+	__x = (__x >> 8) | (__x << 8);	\
+	__x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4);	\
+	__x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2);	\
+	__x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1);	\
+	__x;								\
+})
+
+#define __constant_bitrev8(x)	\
+({					\
+	u8 __x = x;			\
+	__x = (__x >> 4) | (__x << 4);	\
+	__x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2);	\
+	__x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1);	\
+	__x;								\
+})
+
+#define bitrev32(x) \
+({			\
+	u32 __x = x;	\
+	__builtin_constant_p(__x) ?	\
+	__constant_bitrev32(__x) :			\
+	__bitrev32(__x);				\
+})
+
+#define bitrev16(x) \
+({			\
+	u16 __x = x;	\
+	__builtin_constant_p(__x) ?	\
+	__constant_bitrev16(__x) :			\
+	__bitrev16(__x);				\
+ })
 
+#define bitrev8(x) \
+({			\
+	u8 __x = x;	\
+	__builtin_constant_p(__x) ?	\
+	__constant_bitrev8(__x) :			\
+	__bitrev8(__x)	;			\
+ })
 #endif /* _LINUX_BITREV_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 54cf309a92a5..cd177caf3876 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -13,6 +13,15 @@ config RAID6_PQ
 config BITREVERSE
 	tristate
 
+config HAVE_ARCH_BITREVERSE
+	boolean
+	default n
+	depends on BITREVERSE
+	help
+	  This option provides an config for the architecture which have instruction
+	  can do bitreverse operation, we use the hardware instruction if the architecture
+	  have this capability.
+
 config RATIONAL
 	boolean
 
diff --git a/lib/bitrev.c b/lib/bitrev.c
index 3956203456d4..40ffda94cc5d 100644
--- a/lib/bitrev.c
+++ b/lib/bitrev.c
@@ -1,3 +1,4 @@
+#ifndef CONFIG_HAVE_ARCH_BITREVERSE
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/bitrev.h>
@@ -42,18 +43,4 @@ const u8 byte_rev_table[256] = {
 };
 EXPORT_SYMBOL_GPL(byte_rev_table);
 
-u16 bitrev16(u16 x)
-{
-	return (bitrev8(x & 0xff) << 8) | bitrev8(x >> 8);
-}
-EXPORT_SYMBOL(bitrev16);
-
-/**
- * bitrev32 - reverse the order of bits in a u32 value
- * @x: value to be bit-reversed
- */
-u32 bitrev32(u32 x)
-{
-	return (bitrev16(x & 0xffff) << 16) | bitrev16(x >> 16);
-}
-EXPORT_SYMBOL(bitrev32);
+#endif /* CONFIG_HAVE_ARCH_BITREVERSE */
-- 
cgit v1.2.3


From 77584ee57434813b50fc85cde995a6271a5081b7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 23 Dec 2014 16:40:17 +1100
Subject: hwrng: core - Use struct completion for cleanup_done

There is no point in doing a manual completion for cleanup_done
when struct completion fits in perfectly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/core.c | 12 +++---------
 include/linux/hw_random.h     |  3 ++-
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 6ec42252e46e..3dba2cf50241 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -60,7 +60,6 @@ static DEFINE_MUTEX(rng_mutex);
 static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
-static DECLARE_WAIT_QUEUE_HEAD(rng_done);
 static unsigned short current_quality;
 static unsigned short default_quality; /* = 0; default to "off" */
 
@@ -100,10 +99,7 @@ static inline void cleanup_rng(struct kref *kref)
 	if (rng->cleanup)
 		rng->cleanup(rng);
 
-	/* cleanup_done should be updated after cleanup finishes */
-	smp_wmb();
-	rng->cleanup_done = true;
-	wake_up_all(&rng_done);
+	complete(&rng->cleanup_done);
 }
 
 static void set_current_rng(struct hwrng *rng)
@@ -498,7 +494,7 @@ int hwrng_register(struct hwrng *rng)
 		add_early_randomness(rng);
 	}
 
-	rng->cleanup_done = false;
+	init_completion(&rng->cleanup_done);
 
 out_unlock:
 	mutex_unlock(&rng_mutex);
@@ -532,9 +528,7 @@ void hwrng_unregister(struct hwrng *rng)
 	} else
 		mutex_unlock(&rng_mutex);
 
-	/* Just in case rng is reading right now, wait. */
-	wait_event(rng_done, rng->cleanup_done &&
-		   atomic_read(&rng->ref.refcount) == 0);
+	wait_for_completion(&rng->cleanup_done);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index 7832e5008959..eb7b414d232b 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -12,6 +12,7 @@
 #ifndef LINUX_HWRANDOM_H_
 #define LINUX_HWRANDOM_H_
 
+#include <linux/completion.h>
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/kref.h>
@@ -46,7 +47,7 @@ struct hwrng {
 	/* internal. */
 	struct list_head list;
 	struct kref ref;
-	bool cleanup_done;
+	struct completion cleanup_done;
 };
 
 /** Register a new Hardware Random Number Generator driver. */
-- 
cgit v1.2.3


From 7ab374a053a43050117eb452306b6cd9dcb58cfd Mon Sep 17 00:00:00 2001
From: Karol Wrona <k.wrona@samsung.com>
Date: Fri, 19 Dec 2014 18:39:24 +0100
Subject: iio: kfifo: Remove unused argument in iio_kfifo_allocate

indio_dev was unused in function body plus some small style fix - add new
lines after "if(sth) return sth" and before the last return statement.

The argument was removed also in its client.

Signed-off-by: Karol Wrona <k.wrona@samsung.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/adc/ti_am335x_adc.c                 | 2 +-
 drivers/iio/industrialio-triggered-buffer.c     | 2 +-
 drivers/iio/kfifo_buf.c                         | 6 ++++--
 drivers/staging/iio/accel/lis3l02dq_ring.c      | 2 +-
 drivers/staging/iio/iio_simple_dummy_buffer.c   | 2 +-
 drivers/staging/iio/impedance-analyzer/ad5933.c | 2 +-
 drivers/staging/iio/meter/ade7758_ring.c        | 2 +-
 include/linux/iio/kfifo_buf.h                   | 2 +-
 8 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c
index d550ac7d2365..5eea299518a3 100644
--- a/drivers/iio/adc/ti_am335x_adc.c
+++ b/drivers/iio/adc/ti_am335x_adc.c
@@ -250,7 +250,7 @@ static int tiadc_iio_buffered_hardware_setup(struct iio_dev *indio_dev,
 	struct iio_buffer *buffer;
 	int ret;
 
-	buffer = iio_kfifo_allocate(indio_dev);
+	buffer = iio_kfifo_allocate();
 	if (!buffer)
 		return -ENOMEM;
 
diff --git a/drivers/iio/industrialio-triggered-buffer.c b/drivers/iio/industrialio-triggered-buffer.c
index 61a5d0404edf..15a5341b5e7b 100644
--- a/drivers/iio/industrialio-triggered-buffer.c
+++ b/drivers/iio/industrialio-triggered-buffer.c
@@ -49,7 +49,7 @@ int iio_triggered_buffer_setup(struct iio_dev *indio_dev,
 	struct iio_buffer *buffer;
 	int ret;
 
-	buffer = iio_kfifo_allocate(indio_dev);
+	buffer = iio_kfifo_allocate();
 	if (!buffer) {
 		ret = -ENOMEM;
 		goto error_ret;
diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c
index b20a9cfbc8ed..7f6fad658e83 100644
--- a/drivers/iio/kfifo_buf.c
+++ b/drivers/iio/kfifo_buf.c
@@ -140,18 +140,20 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = {
 	.release = &iio_kfifo_buffer_release,
 };
 
-struct iio_buffer *iio_kfifo_allocate(struct iio_dev *indio_dev)
+struct iio_buffer *iio_kfifo_allocate(void)
 {
 	struct iio_kfifo *kf;
 
-	kf = kzalloc(sizeof *kf, GFP_KERNEL);
+	kf = kzalloc(sizeof(*kf), GFP_KERNEL);
 	if (!kf)
 		return NULL;
+
 	kf->update_needed = true;
 	iio_buffer_init(&kf->buffer);
 	kf->buffer.access = &kfifo_access_funcs;
 	kf->buffer.length = 2;
 	mutex_init(&kf->user_lock);
+
 	return &kf->buffer;
 }
 EXPORT_SYMBOL(iio_kfifo_allocate);
diff --git a/drivers/staging/iio/accel/lis3l02dq_ring.c b/drivers/staging/iio/accel/lis3l02dq_ring.c
index 9efc77b0ebdd..1fd90090a633 100644
--- a/drivers/staging/iio/accel/lis3l02dq_ring.c
+++ b/drivers/staging/iio/accel/lis3l02dq_ring.c
@@ -393,7 +393,7 @@ int lis3l02dq_configure_buffer(struct iio_dev *indio_dev)
 	int ret;
 	struct iio_buffer *buffer;
 
-	buffer = iio_kfifo_allocate(indio_dev);
+	buffer = iio_kfifo_allocate();
 	if (!buffer)
 		return -ENOMEM;
 
diff --git a/drivers/staging/iio/iio_simple_dummy_buffer.c b/drivers/staging/iio/iio_simple_dummy_buffer.c
index a2d72c102119..360a4c980722 100644
--- a/drivers/staging/iio/iio_simple_dummy_buffer.c
+++ b/drivers/staging/iio/iio_simple_dummy_buffer.c
@@ -121,7 +121,7 @@ int iio_simple_dummy_configure_buffer(struct iio_dev *indio_dev)
 	struct iio_buffer *buffer;
 
 	/* Allocate a buffer to use - here a kfifo */
-	buffer = iio_kfifo_allocate(indio_dev);
+	buffer = iio_kfifo_allocate();
 	if (buffer == NULL) {
 		ret = -ENOMEM;
 		goto error_ret;
diff --git a/drivers/staging/iio/impedance-analyzer/ad5933.c b/drivers/staging/iio/impedance-analyzer/ad5933.c
index c50b1380b9aa..39f60aca0838 100644
--- a/drivers/staging/iio/impedance-analyzer/ad5933.c
+++ b/drivers/staging/iio/impedance-analyzer/ad5933.c
@@ -626,7 +626,7 @@ static int ad5933_register_ring_funcs_and_init(struct iio_dev *indio_dev)
 {
 	struct iio_buffer *buffer;
 
-	buffer = iio_kfifo_allocate(indio_dev);
+	buffer = iio_kfifo_allocate();
 	if (!buffer)
 		return -ENOMEM;
 
diff --git a/drivers/staging/iio/meter/ade7758_ring.c b/drivers/staging/iio/meter/ade7758_ring.c
index 27c3ed6ca468..782fd9a3bc0d 100644
--- a/drivers/staging/iio/meter/ade7758_ring.c
+++ b/drivers/staging/iio/meter/ade7758_ring.c
@@ -119,7 +119,7 @@ int ade7758_configure_ring(struct iio_dev *indio_dev)
 	struct iio_buffer *buffer;
 	int ret = 0;
 
-	buffer = iio_kfifo_allocate(indio_dev);
+	buffer = iio_kfifo_allocate();
 	if (!buffer) {
 		ret = -ENOMEM;
 		return ret;
diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h
index 25eeac762e84..1a8d57a41738 100644
--- a/include/linux/iio/kfifo_buf.h
+++ b/include/linux/iio/kfifo_buf.h
@@ -5,7 +5,7 @@
 #include <linux/iio/iio.h>
 #include <linux/iio/buffer.h>
 
-struct iio_buffer *iio_kfifo_allocate(struct iio_dev *indio_dev);
+struct iio_buffer *iio_kfifo_allocate(void);
 void iio_kfifo_free(struct iio_buffer *r);
 
 #endif
-- 
cgit v1.2.3


From 780103fef5c88a97fb9c8d0079bf326ed6147f1f Mon Sep 17 00:00:00 2001
From: Karol Wrona <k.wrona@samsung.com>
Date: Fri, 19 Dec 2014 18:39:25 +0100
Subject: iio: kfifo: Add resource management devm_iio_kfifo_allocate/free

iio kfifo allocate/free gained their devm_ wrappers.

Signed-off-by: Karol Wrona <k.wrona@samsung.com>
Suggested-by: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/driver-model/devres.txt |  2 ++
 drivers/iio/kfifo_buf.c               | 54 +++++++++++++++++++++++++++++++++++
 include/linux/iio/kfifo_buf.h         |  3 ++
 3 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index b5ab416cd53a..6d1e8eeb5990 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -258,6 +258,8 @@ IIO
   devm_iio_device_free()
   devm_iio_device_register()
   devm_iio_device_unregister()
+  devm_iio_kfifo_allocate()
+  devm_iio_kfifo_free()
   devm_iio_trigger_alloc()
   devm_iio_trigger_free()
 
diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c
index 7f6fad658e83..b2beea01c49b 100644
--- a/drivers/iio/kfifo_buf.c
+++ b/drivers/iio/kfifo_buf.c
@@ -164,4 +164,58 @@ void iio_kfifo_free(struct iio_buffer *r)
 }
 EXPORT_SYMBOL(iio_kfifo_free);
 
+static void devm_iio_kfifo_release(struct device *dev, void *res)
+{
+	iio_kfifo_free(*(struct iio_buffer **)res);
+}
+
+static int devm_iio_kfifo_match(struct device *dev, void *res, void *data)
+{
+	struct iio_buffer **r = res;
+
+	if (WARN_ON(!r || !*r))
+		return 0;
+
+	return *r == data;
+}
+
+/**
+ * devm_iio_fifo_allocate - Resource-managed iio_kfifo_allocate()
+ * @dev:		Device to allocate kfifo buffer for
+ *
+ * RETURNS:
+ * Pointer to allocated iio_buffer on success, NULL on failure.
+ */
+struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev)
+{
+	struct iio_buffer **ptr, *r;
+
+	ptr = devres_alloc(devm_iio_kfifo_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return NULL;
+
+	r = iio_kfifo_allocate();
+	if (r) {
+		*ptr = r;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return r;
+}
+EXPORT_SYMBOL(devm_iio_kfifo_allocate);
+
+/**
+ * devm_iio_fifo_free - Resource-managed iio_kfifo_free()
+ * @dev:		Device the buffer belongs to
+ * @r:			The buffer associated with the device
+ */
+void devm_iio_kfifo_free(struct device *dev, struct iio_buffer *r)
+{
+	WARN_ON(devres_release(dev, devm_iio_kfifo_release,
+			       devm_iio_kfifo_match, r));
+}
+EXPORT_SYMBOL(devm_iio_kfifo_free);
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h
index 1a8d57a41738..1683bc710d14 100644
--- a/include/linux/iio/kfifo_buf.h
+++ b/include/linux/iio/kfifo_buf.h
@@ -8,4 +8,7 @@
 struct iio_buffer *iio_kfifo_allocate(void);
 void iio_kfifo_free(struct iio_buffer *r);
 
+struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev);
+void devm_iio_kfifo_free(struct device *dev, struct iio_buffer *r);
+
 #endif
-- 
cgit v1.2.3


From c761d96b079e99d106fa4064e730ef7d0f312f9d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 2 Jan 2015 15:05:12 -0700
Subject: blk-mq: export blk_mq_freeze_queue()

Commit b4c6a028774b exported the start and unfreeze, but we need
the regular blk_mq_freeze_queue() for the loop conversion.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 1 +
 include/linux/blk-mq.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1a41d7aefbd5..a7d4a988516f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -136,6 +136,7 @@ void blk_mq_freeze_queue(struct request_queue *q)
 	blk_mq_freeze_queue_start(q);
 	blk_mq_freeze_queue_wait(q);
 }
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3b43f509432c..5b6500c77ed2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -212,6 +212,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
 
-- 
cgit v1.2.3


From 15acabfd02e35e270360fbe0def898e48754b3d6 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 5 Jan 2015 12:21:45 +0100
Subject: crypto: aead - add check for presence of auth tag

The AEAD decryption operation requires the authentication tag to be
present as part of the cipher text buffer. The added check verifies that
the caller provides a cipher text with at least the authentication tag.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 9c8776d0ada8..90998348e564 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -1412,6 +1412,9 @@ static inline int crypto_aead_encrypt(struct aead_request *req)
  */
 static inline int crypto_aead_decrypt(struct aead_request *req)
 {
+	if (req->cryptlen < crypto_aead_authsize(crypto_aead_reqtfm(req)))
+		return -EINVAL;
+
 	return crypto_aead_crt(crypto_aead_reqtfm(req))->decrypt(req);
 }
 
-- 
cgit v1.2.3


From 5efd2ea8c9f4f12916ffc8ba636792ce052f6911 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 5 Dec 2014 15:13:54 +0100
Subject: usb: core: buffer: smallest buffer should start at ARCH_DMA_MINALIGN

the following error pops up during "testusb -a -t 10"
| musb-hdrc musb-hdrc.1.auto: dma_pool_free buffer-128,	f134e000/be842000 (bad dma)
hcd_buffer_create() creates a few buffers, the smallest has 32 bytes of
size. ARCH_KMALLOC_MINALIGN is set to 64 bytes. This combo results in
hcd_buffer_alloc() returning memory which is 32 bytes aligned and it
might by identified by buffer_offset() as another buffer. This means the
buffer which is on a 32 byte boundary will not get freed, instead it
tries to free another buffer with the error message.

This patch fixes the issue by creating the smallest DMA buffer with the
size of ARCH_KMALLOC_MINALIGN (or 32 in case ARCH_KMALLOC_MINALIGN is
smaller). This might be 32, 64 or even 128 bytes. The next three pools
will have the size 128, 512 and 2048.
In case the smallest pool is 128 bytes then we have only three pools
instead of four (and zero the first entry in the array).
The last pool size is always 2048 bytes which is the assumed PAGE_SIZE /
2 of 4096. I doubt it makes sense to continue using PAGE_SIZE / 2 where
we would end up with 8KiB buffer in case we have 16KiB pages.
Instead I think it makes sense to have a common size(s) and extend them
if there is need to.
There is a BUILD_BUG_ON() now in case someone has a minalign of more than
128 bytes.

Cc: stable@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/buffer.c | 26 +++++++++++++++++---------
 drivers/usb/core/usb.c    |  1 +
 include/linux/usb/hcd.h   |  1 +
 3 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c
index 684ef70dc09d..506b969ea7fd 100644
--- a/drivers/usb/core/buffer.c
+++ b/drivers/usb/core/buffer.c
@@ -22,17 +22,25 @@
  */
 
 /* FIXME tune these based on pool statistics ... */
-static const size_t	pool_max[HCD_BUFFER_POOLS] = {
-	/* platforms without dma-friendly caches might need to
-	 * prevent cacheline sharing...
-	 */
-	32,
-	128,
-	512,
-	PAGE_SIZE / 2
-	/* bigger --> allocate pages */
+static size_t pool_max[HCD_BUFFER_POOLS] = {
+	32, 128, 512, 2048,
 };
 
+void __init usb_init_pool_max(void)
+{
+	/*
+	 * The pool_max values must never be smaller than
+	 * ARCH_KMALLOC_MINALIGN.
+	 */
+	if (ARCH_KMALLOC_MINALIGN <= 32)
+		;			/* Original value is okay */
+	else if (ARCH_KMALLOC_MINALIGN <= 64)
+		pool_max[0] = 64;
+	else if (ARCH_KMALLOC_MINALIGN <= 128)
+		pool_max[0] = 0;	/* Don't use this pool */
+	else
+		BUILD_BUG();		/* We don't allow this */
+}
 
 /* SETUP primitives */
 
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 2a92b97f0144..b1fb9aef0f5b 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -1049,6 +1049,7 @@ static int __init usb_init(void)
 		pr_info("%s: USB support disabled\n", usbcore_name);
 		return 0;
 	}
+	usb_init_pool_max();
 
 	retval = usb_debugfs_init();
 	if (retval)
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 086bf13307e6..8968f616e414 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -453,6 +453,7 @@ extern const struct dev_pm_ops usb_hcd_pci_pm_ops;
 #endif /* CONFIG_PCI */
 
 /* pci-ish (pdev null is ok) buffer alloc/mapping support */
+void usb_init_pool_max(void);
 int hcd_buffer_create(struct usb_hcd *hcd);
 void hcd_buffer_destroy(struct usb_hcd *hcd);
 
-- 
cgit v1.2.3


From 314b41b16a71ee824f55e2791fcb92997672da37 Mon Sep 17 00:00:00 2001
From: Wu Liang feng <wulf@rock-chips.com>
Date: Wed, 24 Dec 2014 18:22:19 +0800
Subject: USB: ehci-platform: Support ehci reset after resume quirk

The Rockchip rk3288 EHCI controller doesn't properly detect
the case when a device is removed during suspend. Specifically,
when usb resume from suspend, the EHCI controller maintaining
the USB state (FLAG_CF is 1, Current Connect Status is 1),
but a USB device (like a USB camera on rk3288) may have been
disconnected actually.

Let's add a quirk to force ehci to go into the
usb_root_hub_lost_power() path and reset after resume.
This should generally reset the whole controller and all
ports and initialize everything cleanly again, and bring
the devices back up.

As part of this, rename the "hibernation" paramter of
ehci_resume() to force_reset since hibernation is simply
another case where we can't trust the autodetected status
and need to force a reset of devices.

Signed-off-by: Wu Liang feng <wulf@rock-chips.com>
Reviewed-by: Julius Werner <jwerner@google.com>
Reviewed-by: Doug Anderson <dianders@google.com>
Reviewed-by: Tomasz Figa <tfiga@google.com>
Reviewed-by: Pawel Osciak <posciak@google.com>
Reviewed-by: Sonny Rao <sonnyrao@google.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Doug Anderson <dianders@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ehci-hcd.c      | 6 +++---
 drivers/usb/host/ehci-platform.c | 6 +++++-
 drivers/usb/host/ehci.h          | 2 +-
 include/linux/usb/ehci_pdriver.h | 3 +++
 4 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 38bfeedae1d0..85e56d1abd23 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -1110,7 +1110,7 @@ int ehci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 EXPORT_SYMBOL_GPL(ehci_suspend);
 
 /* Returns 0 if power was preserved, 1 if power was lost */
-int ehci_resume(struct usb_hcd *hcd, bool hibernated)
+int ehci_resume(struct usb_hcd *hcd, bool force_reset)
 {
 	struct ehci_hcd		*ehci = hcd_to_ehci(hcd);
 
@@ -1124,12 +1124,12 @@ int ehci_resume(struct usb_hcd *hcd, bool hibernated)
 		return 0;		/* Controller is dead */
 
 	/*
-	 * If CF is still set and we aren't resuming from hibernation
+	 * If CF is still set and reset isn't forced
 	 * then we maintained suspend power.
 	 * Just undo the effect of ehci_suspend().
 	 */
 	if (ehci_readl(ehci, &ehci->regs->configured_flag) == FLAG_CF &&
-			!hibernated) {
+			!force_reset) {
 		int	mask = INTR_MASK;
 
 		ehci_prepare_ports_for_controller_resume(ehci);
diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c
index 8557803e6154..db5c29edf6db 100644
--- a/drivers/usb/host/ehci-platform.c
+++ b/drivers/usb/host/ehci-platform.c
@@ -185,6 +185,10 @@ static int ehci_platform_probe(struct platform_device *dev)
 		if (of_property_read_bool(dev->dev.of_node, "big-endian"))
 			ehci->big_endian_mmio = ehci->big_endian_desc = 1;
 
+		if (of_property_read_bool(dev->dev.of_node,
+					  "needs-reset-on-resume"))
+			pdata->reset_on_resume = 1;
+
 		priv->phy = devm_phy_get(&dev->dev, "usb");
 		if (IS_ERR(priv->phy)) {
 			err = PTR_ERR(priv->phy);
@@ -340,7 +344,7 @@ static int ehci_platform_resume(struct device *dev)
 			return err;
 	}
 
-	ehci_resume(hcd, false);
+	ehci_resume(hcd, pdata->reset_on_resume);
 	return 0;
 }
 #endif /* CONFIG_PM_SLEEP */
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index 6f0577b0a5ae..52ef0844a180 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -871,7 +871,7 @@ extern int	ehci_handshake(struct ehci_hcd *ehci, void __iomem *ptr,
 
 #ifdef CONFIG_PM
 extern int	ehci_suspend(struct usb_hcd *hcd, bool do_wakeup);
-extern int	ehci_resume(struct usb_hcd *hcd, bool hibernated);
+extern int	ehci_resume(struct usb_hcd *hcd, bool force_reset);
 #endif	/* CONFIG_PM */
 
 extern int	ehci_hub_control(struct usb_hcd	*hcd, u16 typeReq, u16 wValue,
diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h
index 7eb4dcd0d386..6287b398abd9 100644
--- a/include/linux/usb/ehci_pdriver.h
+++ b/include/linux/usb/ehci_pdriver.h
@@ -34,6 +34,8 @@ struct usb_hcd;
  *			after initialization.
  * @no_io_watchdog:	set to 1 if the controller does not need the I/O
  *			watchdog to run.
+ * @reset_on_resume:	set to 1 if the controller needs to be reset after
+ * 			a suspend / resume cycle (but can't detect that itself).
  *
  * These are general configuration options for the EHCI controller. All of
  * these options are activating more or less workarounds for some hardware.
@@ -45,6 +47,7 @@ struct usb_ehci_pdata {
 	unsigned	big_endian_desc:1;
 	unsigned	big_endian_mmio:1;
 	unsigned	no_io_watchdog:1;
+	unsigned	reset_on_resume:1;
 
 	/* Turn on all power and clocks */
 	int (*power_on)(struct platform_device *pdev);
-- 
cgit v1.2.3


From 4bf4ea9dca4ba1984abeb592f429265b9bacac42 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Tue, 30 Dec 2014 20:28:15 -0500
Subject: serial: omap_8250: Fix RTS handling

The OMAP3 UART ignores MCR[1] (ie., UART_MCR_RTS) when in autoRTS
mode (UPF_HARD_FLOW + CRTSCTS). This makes it impossible for either
the serial core or userspace to manually flow control the sender.

Disable autoRTS mode when RTS is lowered and restore the previous
mode when RTS is raised.

Note that the OMAP3 UART provides no mechanism for switching from
autoRTS mode without corrupting incoming data; to access the
necessary register, the line control settings must be set to 8-e-2
and thus any data received during that time will be interpreted with
those settings. This corruption has been observed in practice.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c | 12 +++++++++++-
 drivers/tty/serial/8250/8250_omap.c | 25 ++++++++++++++++++++++---
 include/linux/serial_8250.h         |  1 +
 include/linux/serial_core.h         |  1 +
 4 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 11c66856ba2f..3bfcfdb57d05 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1924,7 +1924,7 @@ static unsigned int serial8250_get_mctrl(struct uart_port *port)
 	return ret;
 }
 
-static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
+void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned char mcr = 0;
@@ -1944,6 +1944,14 @@ static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
 
 	serial_port_out(port, UART_MCR, mcr);
 }
+EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl);
+
+static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	if (port->set_mctrl)
+		return port->set_mctrl(port, mctrl);
+	return serial8250_do_set_mctrl(port, mctrl);
+}
 
 static void serial8250_break_ctl(struct uart_port *port, int break_state)
 {
@@ -3605,6 +3613,8 @@ int serial8250_register_8250_port(struct uart_8250_port *up)
 		/*  Possibly override set_termios call */
 		if (up->port.set_termios)
 			uart->port.set_termios = up->port.set_termios;
+		if (up->port.set_mctrl)
+			uart->port.set_mctrl = up->port.set_mctrl;
 		if (up->port.startup)
 			uart->port.startup = up->port.startup;
 		if (up->port.shutdown)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 273f37c6b493..227033db214b 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -106,6 +106,27 @@ static u32 uart_read(struct uart_8250_port *up, u32 reg)
 	return readl(up->port.membase + (reg << up->port.regshift));
 }
 
+static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	struct omap8250_priv *priv = up->port.private_data;
+	u8 lcr;
+
+	serial8250_do_set_mctrl(port, mctrl);
+
+	/*
+	 * Turn off autoRTS if RTS is lowered and restore autoRTS setting
+	 * if RTS is raised
+	 */
+	lcr = serial_in(up, UART_LCR);
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
+	if (mctrl & TIOCM_RTS)
+		serial_out(up, UART_EFR, priv->efr);
+	else
+		serial_out(up, UART_EFR, priv->efr & ~UART_EFR_RTS);
+	serial_out(up, UART_LCR, lcr);
+}
+
 /*
  * Work Around for Errata i202 (2430, 3430, 3630, 4430 and 4460)
  * The access to uart register after MDR1 Access
@@ -400,9 +421,6 @@ static void omap_8250_set_termios(struct uart_port *port,
 	if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) {
 		/* Enable AUTORTS and AUTOCTS */
 		priv->efr |= UART_EFR_CTS | UART_EFR_RTS;
-
-		/* Ensure MCR RTS is asserted */
-		up->mcr |= UART_MCR_RTS;
 	} else	if (up->port.flags & UPF_SOFT_FLOW) {
 		/*
 		 * IXON Flag:
@@ -1007,6 +1025,7 @@ static int omap8250_probe(struct platform_device *pdev)
 	up.capabilities |= UART_CAP_RPM;
 #endif
 	up.port.set_termios = omap_8250_set_termios;
+	up.port.set_mctrl = omap8250_set_mctrl;
 	up.port.pm = omap_8250_pm;
 	up.port.startup = omap_8250_startup;
 	up.port.shutdown = omap_8250_shutdown;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index e02acf0a0ec9..245b959f1ff6 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -126,6 +126,7 @@ extern int serial8250_do_startup(struct uart_port *port);
 extern void serial8250_do_shutdown(struct uart_port *port);
 extern void serial8250_do_pm(struct uart_port *port, unsigned int state,
 			     unsigned int oldstate);
+extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl);
 extern int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
 unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 057038cf2788..a0c7033d5f91 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -123,6 +123,7 @@ struct uart_port {
 	void			(*set_termios)(struct uart_port *,
 				               struct ktermios *new,
 				               struct ktermios *old);
+	void			(*set_mctrl)(struct uart_port *, unsigned int);
 	int			(*startup)(struct uart_port *port);
 	void			(*shutdown)(struct uart_port *port);
 	void			(*throttle)(struct uart_port *port);
-- 
cgit v1.2.3


From a291b7d5f600a068af1e2186ce590a9a39093ddb Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Wed, 10 Dec 2014 12:49:24 +0100
Subject: serial: s3c: add missing register definitions

This macro definitions are necessary to implement DMA transfers
is samsung serial driver.

Based on previous work of Sylwester Nawrocki and Lukasz Czerwinski.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_s3c.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index e6fc9567690b..a7f004a3c177 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -104,6 +104,31 @@
 				   S3C2410_UCON_RXIRQMODE | \
 				   S3C2410_UCON_RXFIFO_TOI)
 
+#define S3C64XX_UCON_TXBURST_1          (0<<20)
+#define S3C64XX_UCON_TXBURST_4          (1<<20)
+#define S3C64XX_UCON_TXBURST_8          (2<<20)
+#define S3C64XX_UCON_TXBURST_16         (3<<20)
+#define S3C64XX_UCON_TXBURST_MASK       (0xf<<20)
+#define S3C64XX_UCON_RXBURST_1          (0<<16)
+#define S3C64XX_UCON_RXBURST_4          (1<<16)
+#define S3C64XX_UCON_RXBURST_8          (2<<16)
+#define S3C64XX_UCON_RXBURST_16         (3<<16)
+#define S3C64XX_UCON_RXBURST_MASK       (0xf<<16)
+#define S3C64XX_UCON_TIMEOUT_SHIFT      (12)
+#define S3C64XX_UCON_TIMEOUT_MASK       (0xf<<12)
+#define S3C64XX_UCON_EMPTYINT_EN        (1<<11)
+#define S3C64XX_UCON_DMASUS_EN          (1<<10)
+#define S3C64XX_UCON_TXINT_LEVEL        (1<<9)
+#define S3C64XX_UCON_RXINT_LEVEL        (1<<8)
+#define S3C64XX_UCON_TIMEOUT_EN         (1<<7)
+#define S3C64XX_UCON_ERRINT_EN          (1<<6)
+#define S3C64XX_UCON_TXMODE_DMA         (2<<2)
+#define S3C64XX_UCON_TXMODE_CPU         (1<<2)
+#define S3C64XX_UCON_TXMODE_MASK        (3<<2)
+#define S3C64XX_UCON_RXMODE_DMA         (2<<0)
+#define S3C64XX_UCON_RXMODE_CPU         (1<<0)
+#define S3C64XX_UCON_RXMODE_MASK        (3<<0)
+
 #define S3C2410_UFCON_FIFOMODE	  (1<<0)
 #define S3C2410_UFCON_TXTRIG0	  (0<<6)
 #define S3C2410_UFCON_RXTRIG8	  (1<<4)
@@ -155,6 +180,7 @@
 #define S3C2440_UFSTAT_TXMASK	  (63<<8)
 #define S3C2440_UFSTAT_RXMASK	  (63)
 
+#define S3C2410_UTRSTAT_TIMEOUT   (1<<3)
 #define S3C2410_UTRSTAT_TXE	  (1<<2)
 #define S3C2410_UTRSTAT_TXFE	  (1<<1)
 #define S3C2410_UTRSTAT_RXDR	  (1<<0)
@@ -179,8 +205,10 @@
 #define S3C64XX_UINTM		0x38
 
 #define S3C64XX_UINTM_RXD	(0)
+#define S3C64XX_UINTM_ERROR     (1)
 #define S3C64XX_UINTM_TXD	(2)
 #define S3C64XX_UINTM_RXD_MSK	(1 << S3C64XX_UINTM_RXD)
+#define S3C64XX_UINTM_ERR_MSK   (1 << S3C64XX_UINTM_ERROR)
 #define S3C64XX_UINTM_TXD_MSK	(1 << S3C64XX_UINTM_TXD)
 
 /* Following are specific to S5PV210 */
-- 
cgit v1.2.3


From 09eb483e895f36fd002e88c878e9578c359aa468 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 23 Dec 2014 16:26:31 -0800
Subject: f2fs: fix missing cold bit during recovery

In do_recover_data, we find and update previous node pages after updating
its new block addresses.
After then, we call fill_node_footer without reset field, we erase its
cold bit so that this new cold node block is written to wrong log area.
This patch fixes not to miss its old flag.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.h          | 10 +++++++++-
 include/linux/f2fs_fs.h |  2 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index fa6f95968b42..cac8a3d9acbd 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -212,11 +212,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
 				nid_t ino, unsigned int ofs, bool reset)
 {
 	struct f2fs_node *rn = F2FS_NODE(page);
+	unsigned int old_flag = 0;
+
 	if (reset)
 		memset(rn, 0, sizeof(*rn));
+	else
+		old_flag = le32_to_cpu(rn->footer.flag);
+
 	rn->footer.nid = cpu_to_le32(nid);
 	rn->footer.ino = cpu_to_le32(ino);
-	rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+
+	/* should remain old flag bits such as COLD_BIT_SHIFT */
+	rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
+					(old_flag & OFFSET_BIT_MASK));
 }
 
 static inline void copy_node_footer(struct page *dst, struct page *src)
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 87f14e90e984..e993b0bc9abf 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -224,6 +224,8 @@ enum {
 	OFFSET_BIT_SHIFT
 };
 
+#define OFFSET_BIT_MASK		(0x07)	/* (0x01 << OFFSET_BIT_SHIFT) - 1 */
+
 struct node_footer {
 	__le32 nid;		/* node id */
 	__le32 ino;		/* inode nunmber */
-- 
cgit v1.2.3


From 39db74ce1aa83626a0a70ed2abf29a17598fff49 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 27 Nov 2014 14:07:28 +0200
Subject: mei: bus: use ssize_t as the return type for send and receive

Mei bus receive and send function may return either number
of transmitted bytes or errno.  It is better to use ssize_t
type for that purpose that mixing size_t with int.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c     | 30 ++++++++++++++----------------
 drivers/misc/mei/mei_dev.h |  6 +++---
 include/linux/mei_cl_bus.h |  4 ++--
 3 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index b3a72bca5242..31164dd14bd0 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -224,13 +224,13 @@ void mei_cl_driver_unregister(struct mei_cl_driver *driver)
 }
 EXPORT_SYMBOL_GPL(mei_cl_driver_unregister);
 
-static int ___mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length,
+static ssize_t ___mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length,
 			bool blocking)
 {
 	struct mei_device *dev;
 	struct mei_me_client *me_cl;
 	struct mei_cl_cb *cb;
-	int rets;
+	ssize_t rets;
 
 	if (WARN_ON(!cl || !cl->dev))
 		return -ENODEV;
@@ -271,12 +271,12 @@ static int ___mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length,
 	return rets;
 }
 
-int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length)
+ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length)
 {
 	struct mei_device *dev;
 	struct mei_cl_cb *cb;
 	size_t r_length;
-	int err;
+	ssize_t rets;
 
 	if (WARN_ON(!cl || !cl->dev))
 		return -ENODEV;
@@ -286,11 +286,9 @@ int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length)
 	mutex_lock(&dev->device_lock);
 
 	if (!cl->read_cb) {
-		err = mei_cl_read_start(cl, length);
-		if (err < 0) {
-			mutex_unlock(&dev->device_lock);
-			return err;
-		}
+		rets = mei_cl_read_start(cl, length);
+		if (rets < 0)
+			goto out;
 	}
 
 	if (cl->reading_state != MEI_READ_COMPLETE &&
@@ -313,13 +311,13 @@ int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length)
 	cb = cl->read_cb;
 
 	if (cl->reading_state != MEI_READ_COMPLETE) {
-		r_length = 0;
+		rets = 0;
 		goto out;
 	}
 
 	r_length = min_t(size_t, length, cb->buf_idx);
-
 	memcpy(buf, cb->response_buffer.data, r_length);
+	rets = r_length;
 
 	mei_io_cb_free(cb);
 	cl->reading_state = MEI_IDLE;
@@ -328,20 +326,20 @@ int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length)
 out:
 	mutex_unlock(&dev->device_lock);
 
-	return r_length;
+	return rets;
 }
 
-inline int __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length)
+inline ssize_t __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length)
 {
 	return ___mei_cl_send(cl, buf, length, 0);
 }
 
-inline int __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length)
+inline ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length)
 {
 	return ___mei_cl_send(cl, buf, length, 1);
 }
 
-int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length)
+ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length)
 {
 	struct mei_cl *cl = device->cl;
 
@@ -355,7 +353,7 @@ int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length)
 }
 EXPORT_SYMBOL_GPL(mei_cl_send);
 
-int mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length)
+ssize_t mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length)
 {
 	struct mei_cl *cl =  device->cl;
 
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index 3dad74a8d496..7f350af2ee10 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -345,9 +345,9 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *dev,
 					struct mei_cl_ops *ops);
 void mei_cl_remove_device(struct mei_cl_device *device);
 
-int __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length);
-int __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length);
-int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length);
+ssize_t __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length);
+ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length);
+ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length);
 void mei_cl_bus_rx_event(struct mei_cl *cl);
 void mei_cl_bus_remove_devices(struct mei_device *dev);
 int mei_cl_bus_init(void);
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 164aad1f9f12..0819d36a3a74 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -25,8 +25,8 @@ int __mei_cl_driver_register(struct mei_cl_driver *driver,
 
 void mei_cl_driver_unregister(struct mei_cl_driver *driver);
 
-int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length);
-int mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length);
+ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length);
+ssize_t  mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length);
 
 typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
 			       u32 events, void *context);
-- 
cgit v1.2.3


From 46d0d33350e9b32642d745a8b46a954910196b4d Mon Sep 17 00:00:00 2001
From: Gigi Joseph <gigi.joseph@gmail.com>
Date: Fri, 9 Jan 2015 03:45:02 +0000
Subject: ti-st: add device tree support

When using device tree, driver configuration data need to be read from
device node.
Add support for getting the platform data information from the device
tree information stored in the .dtb file in case it exists.

Signed-off-by: Eyal Reizer <eyalr@ti.com>
Signed-off-by: bvijay <bvijay@ti.com>
Diff rendering mode:inlineside by side

Signed-off-by: Gigi Joseph <gigi.joseph@ti.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/ti-st/st_kim.c  | 97 ++++++++++++++++++++++++++++++++++++++++----
 drivers/misc/ti-st/st_ll.c   | 17 +++++++-
 include/linux/ti_wilink_st.h |  1 +
 3 files changed, 105 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c
index e4b7ee4f57b8..68a0b582d81a 100644
--- a/drivers/misc/ti-st/st_kim.c
+++ b/drivers/misc/ti-st/st_kim.c
@@ -36,7 +36,8 @@
 #include <linux/skbuff.h>
 #include <linux/ti_wilink_st.h>
 #include <linux/module.h>
-
+#include <linux/of.h>
+#include <linux/of_device.h>
 
 #define MAX_ST_DEVICES	3	/* Imagine 1 on each UART for now */
 static struct platform_device *st_kim_devices[MAX_ST_DEVICES];
@@ -44,6 +45,9 @@ static struct platform_device *st_kim_devices[MAX_ST_DEVICES];
 /**********************************************************************/
 /* internal functions */
 
+struct ti_st_plat_data	*dt_pdata;
+static struct ti_st_plat_data *get_platform_data(struct device *dev);
+
 /**
  * st_get_plat_device -
  *	function which returns the reference to the platform device
@@ -462,7 +466,12 @@ long st_kim_start(void *kim_data)
 	struct kim_data_s	*kim_gdata = (struct kim_data_s *)kim_data;
 
 	pr_info(" %s", __func__);
-	pdata = kim_gdata->kim_pdev->dev.platform_data;
+	if (kim_gdata->kim_pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else {
+		pdata = kim_gdata->kim_pdev->dev.platform_data;
+	}
 
 	do {
 		/* platform specific enabling code here */
@@ -522,12 +531,18 @@ long st_kim_stop(void *kim_data)
 {
 	long err = 0;
 	struct kim_data_s	*kim_gdata = (struct kim_data_s *)kim_data;
-	struct ti_st_plat_data	*pdata =
-		kim_gdata->kim_pdev->dev.platform_data;
+	struct ti_st_plat_data	*pdata;
 	struct tty_struct	*tty = kim_gdata->core_data->tty;
 
 	reinit_completion(&kim_gdata->ldisc_installed);
 
+	if (kim_gdata->kim_pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else
+		pdata = kim_gdata->kim_pdev->dev.platform_data;
+
+
 	if (tty) {	/* can be called before ldisc is installed */
 		/* Flush any pending characters in the driver and discipline. */
 		tty_ldisc_flush(tty);
@@ -715,13 +730,53 @@ static const struct file_operations list_debugfs_fops = {
  * board-*.c file
  */
 
+static const struct of_device_id kim_of_match[] = {
+{
+	.compatible = "kim",
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, kim_of_match);
+
+static struct ti_st_plat_data *get_platform_data(struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+	const u32 *dt_property;
+	int len;
+
+	dt_pdata = kzalloc(sizeof(*dt_pdata), GFP_KERNEL);
+
+	if (!dt_pdata)
+		pr_err("Can't allocate device_tree platform data\n");
+
+	dt_property = of_get_property(np, "dev_name", &len);
+	if (dt_property)
+		memcpy(&dt_pdata->dev_name, dt_property, len);
+	of_property_read_u32(np, "nshutdown_gpio",
+			     (u32 *)&dt_pdata->nshutdown_gpio);
+	of_property_read_u32(np, "flow_cntrl", (u32 *)&dt_pdata->flow_cntrl);
+	of_property_read_u32(np, "baud_rate", (u32 *)&dt_pdata->baud_rate);
+
+	return dt_pdata;
+}
+
 static struct dentry *kim_debugfs_dir;
 static int kim_probe(struct platform_device *pdev)
 {
 	struct kim_data_s	*kim_gdata;
-	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
+	struct ti_st_plat_data	*pdata;
 	int err;
 
+	if (pdev->dev.of_node)
+		pdata = get_platform_data(&pdev->dev);
+	else
+		pdata = pdev->dev.platform_data;
+
+	if (pdata == NULL) {
+		dev_err(&pdev->dev, "Platform Data is missing\n");
+		return -ENXIO;
+	}
+
 	if ((pdev->id != -1) && (pdev->id < MAX_ST_DEVICES)) {
 		/* multiple devices could exist */
 		st_kim_devices[pdev->id] = pdev;
@@ -806,9 +861,16 @@ err_core_init:
 static int kim_remove(struct platform_device *pdev)
 {
 	/* free the GPIOs requested */
-	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
+	struct ti_st_plat_data	*pdata;
 	struct kim_data_s	*kim_gdata;
 
+	if (pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else {
+		pdata = pdev->dev.platform_data;
+	}
+
 	kim_gdata = platform_get_drvdata(pdev);
 
 	/* Free the Bluetooth/FM/GPIO
@@ -826,12 +888,22 @@ static int kim_remove(struct platform_device *pdev)
 
 	kfree(kim_gdata);
 	kim_gdata = NULL;
+	kfree(dt_pdata);
+	dt_pdata = NULL;
+
 	return 0;
 }
 
 static int kim_suspend(struct platform_device *pdev, pm_message_t state)
 {
-	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
+	struct ti_st_plat_data	*pdata;
+
+	if (pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else {
+		pdata = pdev->dev.platform_data;
+	}
 
 	if (pdata->suspend)
 		return pdata->suspend(pdev, state);
@@ -841,7 +913,14 @@ static int kim_suspend(struct platform_device *pdev, pm_message_t state)
 
 static int kim_resume(struct platform_device *pdev)
 {
-	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
+	struct ti_st_plat_data	*pdata;
+
+	if (pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else {
+		pdata = pdev->dev.platform_data;
+	}
 
 	if (pdata->resume)
 		return pdata->resume(pdev);
@@ -858,6 +937,8 @@ static struct platform_driver kim_platform_driver = {
 	.resume = kim_resume,
 	.driver = {
 		.name = "kim",
+		.owner = THIS_MODULE,
+		.of_match_table = of_match_ptr(kim_of_match),
 	},
 };
 
diff --git a/drivers/misc/ti-st/st_ll.c b/drivers/misc/ti-st/st_ll.c
index 93b4d67cc4a3..518e1b7f2f95 100644
--- a/drivers/misc/ti-st/st_ll.c
+++ b/drivers/misc/ti-st/st_ll.c
@@ -26,6 +26,7 @@
 #include <linux/ti_wilink_st.h>
 
 /**********************************************************************/
+
 /* internal functions */
 static void send_ll_cmd(struct st_data_s *st_data,
 	unsigned char cmd)
@@ -53,7 +54,13 @@ static void ll_device_want_to_sleep(struct st_data_s *st_data)
 
 	/* communicate to platform about chip asleep */
 	kim_data = st_data->kim_data;
-	pdata = kim_data->kim_pdev->dev.platform_data;
+	if (kim_data->kim_pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else {
+		pdata = kim_data->kim_pdev->dev.platform_data;
+	}
+
 	if (pdata->chip_asleep)
 		pdata->chip_asleep(NULL);
 }
@@ -86,7 +93,13 @@ static void ll_device_want_to_wakeup(struct st_data_s *st_data)
 
 	/* communicate to platform about chip wakeup */
 	kim_data = st_data->kim_data;
-	pdata = kim_data->kim_pdev->dev.platform_data;
+	if (kim_data->kim_pdev->dev.of_node) {
+		pr_debug("use device tree data");
+		pdata = dt_pdata;
+	} else {
+		pdata = kim_data->kim_pdev->dev.platform_data;
+	}
+
 	if (pdata->chip_awake)
 		pdata->chip_awake(NULL);
 }
diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h
index 884d6263e962..9072d9f95cff 100644
--- a/include/linux/ti_wilink_st.h
+++ b/include/linux/ti_wilink_st.h
@@ -86,6 +86,7 @@ struct st_proto_s {
 extern long st_register(struct st_proto_s *);
 extern long st_unregister(struct st_proto_s *);
 
+extern struct ti_st_plat_data   *dt_pdata;
 
 /*
  * header information used by st_core.c
-- 
cgit v1.2.3


From f379984f849d729bd2eb076b633200b1c040611e Mon Sep 17 00:00:00 2001
From: Xia Kaixu <xiakaixu@huawei.com>
Date: Fri, 9 Jan 2015 16:57:18 -0700
Subject: coresight: remove the unused macro CORESIGHT_DEBUGFS_ENTRY

Debugfs isn't used for coresight configuration, so the macro
CORESIGHT_DEBUGFS_ENTRY is unnecessary, just remove it.

Signed-off-by: Xia Kaixu <xiakaixu@huawei.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/coresight.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 5d3c54311f7a..7cbfecbfa643 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -179,15 +179,6 @@ struct coresight_device {
 #define sink_ops(csdev)		csdev->ops->sink_ops
 #define link_ops(csdev)		csdev->ops->link_ops
 
-#define CORESIGHT_DEBUGFS_ENTRY(__name, __entry_name,			\
-				 __mode, __get, __set, __fmt)		\
-DEFINE_SIMPLE_ATTRIBUTE(__name ## _ops, __get, __set, __fmt);		\
-static const struct coresight_ops_entry __name ## _entry = {		\
-	.name = __entry_name,						\
-	.mode = __mode,							\
-	.ops  = &__name ## _ops						\
-}
-
 /**
  * struct coresight_ops_sink - basic operations for a sink
  * Operations available for sinks
-- 
cgit v1.2.3


From c61c4b5dd2c6b5dbf0f7e299db1e8411ef590f5c Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 9 Jan 2015 16:57:20 -0700
Subject: coresight: Fixing wrong #ifdef/#endif placement

Fixing problem reported by:
        https://lkml.org/lkml/2015/1/6/86

The #ifdef/#endif is wrong and prevents the stub of function
of_get_coresight_platform_data() from being visible when
CONFIG_OF is not defined.

Moving CONFIG_OF condition out of CONFIG_CORESIGHT, making
them both independent.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/coresight.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 7cbfecbfa643..cd6d4c384ca3 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -230,10 +230,6 @@ extern void coresight_disable(struct coresight_device *csdev);
 extern int coresight_is_bit_set(u32 val, int position, int value);
 extern int coresight_timeout(void __iomem *addr, u32 offset,
 			     int position, int value);
-#ifdef CONFIG_OF
-extern struct coresight_platform_data *of_get_coresight_platform_data(
-				struct device *dev, struct device_node *node);
-#endif
 #else
 static inline struct coresight_device *
 coresight_register(struct coresight_desc *desc) { return NULL; }
@@ -245,10 +241,14 @@ static inline int coresight_is_bit_set(u32 val, int position, int value)
 					 { return 0; }
 static inline int coresight_timeout(void __iomem *addr, u32 offset,
 				     int position, int value) { return 1; }
+#endif
+
 #ifdef CONFIG_OF
+extern struct coresight_platform_data *of_get_coresight_platform_data(
+				struct device *dev, struct device_node *node);
+#else
 static inline struct coresight_platform_data *of_get_coresight_platform_data(
 	struct device *dev, struct device_node *node) { return NULL; }
 #endif
-#endif
 
 #endif
-- 
cgit v1.2.3


From 07673204777abe893092cc6e42ea04e5c0ac8cb7 Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Tue, 30 Dec 2014 14:02:28 +0800
Subject: usb: phy: change some comments

- Delete the OTG stuffs
- .set_suspend is for controller, not for A-device or B-device.

Signed-off-by: Peter Chen <peter.chen@freescale.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/phy.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h
index f499c23e6342..bc91b5d380fd 100644
--- a/include/linux/usb/phy.h
+++ b/include/linux/usb/phy.h
@@ -1,5 +1,5 @@
-/* USB OTG (On The Go) defines */
 /*
+ * USB PHY defines
  *
  * These APIs may be used between USB controllers.  USB device drivers
  * (for either host or peripheral roles) don't use these calls; they
@@ -106,7 +106,7 @@ struct usb_phy {
 	int	(*set_power)(struct usb_phy *x,
 				unsigned mA);
 
-	/* for non-OTG B devices: set transceiver into suspend mode */
+	/* Set transceiver into suspend mode */
 	int	(*set_suspend)(struct usb_phy *x,
 				int suspend);
 
-- 
cgit v1.2.3


From 7acc9973e3c42de9926b28eec8ae3434dfdde3be Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Sat, 6 Dec 2014 22:05:15 +0100
Subject: usb: phy: generic: add vbus support

Add support for vbus detection and power supply. This code is more or
less stolen from phy-gpio-vbus-usb.c, and aims at providing a detection
mechanism for VBus (ie. usb cable plug) based on a GPIO line, and a
power supply activation which draws current from the VBus.

[ balbi@ti.com : fix build break ]

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/phy/phy-generic.c       | 91 ++++++++++++++++++++++++++++++++++++-
 drivers/usb/phy/phy-generic.h       |  6 +++
 include/linux/usb/usb_phy_generic.h |  2 +
 3 files changed, 98 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/phy/phy-generic.c b/drivers/usb/phy/phy-generic.c
index d53928f3a6ea..dd05254241fb 100644
--- a/drivers/usb/phy/phy-generic.c
+++ b/drivers/usb/phy/phy-generic.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
+#include <linux/usb/gadget.h>
 #include <linux/usb/otg.h>
 #include <linux/usb/usb_phy_generic.h>
 #include <linux/slab.h>
@@ -39,6 +40,9 @@
 
 #include "phy-generic.h"
 
+#define VBUS_IRQ_FLAGS \
+	(IRQF_SHARED | IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING)
+
 struct platform_device *usb_phy_generic_register(void)
 {
 	return platform_device_register_simple("usb_phy_generic",
@@ -66,6 +70,73 @@ static void nop_reset_set(struct usb_phy_generic *nop, int asserted)
 		usleep_range(10000, 20000);
 }
 
+/* interface to regulator framework */
+static void nop_set_vbus_draw(struct usb_phy_generic *nop, unsigned mA)
+{
+	struct regulator *vbus_draw = nop->vbus_draw;
+	int enabled;
+	int ret;
+
+	if (!vbus_draw)
+		return;
+
+	enabled = nop->vbus_draw_enabled;
+	if (mA) {
+		regulator_set_current_limit(vbus_draw, 0, 1000 * mA);
+		if (!enabled) {
+			ret = regulator_enable(vbus_draw);
+			if (ret < 0)
+				return;
+			nop->vbus_draw_enabled = 1;
+		}
+	} else {
+		if (enabled) {
+			ret = regulator_disable(vbus_draw);
+			if (ret < 0)
+				return;
+			nop->vbus_draw_enabled = 0;
+		}
+	}
+	nop->mA = mA;
+}
+
+
+static irqreturn_t nop_gpio_vbus_thread(int irq, void *data)
+{
+	struct usb_phy_generic *nop = data;
+	struct usb_otg *otg = nop->phy.otg;
+	int vbus, status;
+
+	vbus = gpiod_get_value(nop->gpiod_vbus);
+	if ((vbus ^ nop->vbus) == 0)
+		return IRQ_HANDLED;
+	nop->vbus = vbus;
+
+	if (vbus) {
+		status = USB_EVENT_VBUS;
+		otg->state = OTG_STATE_B_PERIPHERAL;
+		nop->phy.last_event = status;
+		usb_gadget_vbus_connect(otg->gadget);
+
+		/* drawing a "unit load" is *always* OK, except for OTG */
+		nop_set_vbus_draw(nop, 100);
+
+		atomic_notifier_call_chain(&nop->phy.notifier, status,
+					   otg->gadget);
+	} else {
+		nop_set_vbus_draw(nop, 0);
+
+		usb_gadget_vbus_disconnect(otg->gadget);
+		status = USB_EVENT_NONE;
+		otg->state = OTG_STATE_B_IDLE;
+		nop->phy.last_event = status;
+
+		atomic_notifier_call_chain(&nop->phy.notifier, status,
+					   otg->gadget);
+	}
+	return IRQ_HANDLED;
+}
+
 int usb_gen_phy_init(struct usb_phy *phy)
 {
 	struct usb_phy_generic *nop = dev_get_drvdata(phy->dev);
@@ -149,17 +220,23 @@ int usb_phy_gen_create_phy(struct device *dev, struct usb_phy_generic *nop,
 		needs_vcc = of_property_read_bool(node, "vcc-supply");
 		nop->gpiod_reset = devm_gpiod_get(dev, "reset-gpios");
 		err = PTR_ERR(nop->gpiod_reset);
+		if (!err) {
+			nop->gpiod_vbus = devm_gpiod_get(dev,
+							 "vbus-detect-gpio");
+			err = PTR_ERR(nop->gpiod_vbus);
+		}
 	} else if (pdata) {
 		type = pdata->type;
 		clk_rate = pdata->clk_rate;
 		needs_vcc = pdata->needs_vcc;
-		if (gpio_is_valid(gpio->gpio_reset)) {
+		if (gpio_is_valid(pdata->gpio_reset)) {
 			err = devm_gpio_request_one(dev, pdata->gpio_reset, 0,
 						    dev_name(dev));
 			if (!err)
 				nop->gpiod_reset =
 					gpio_to_desc(pdata->gpio_reset);
 		}
+		nop->gpiod_vbus = pdata->gpiod_vbus;
 	}
 
 	if (err == -EPROBE_DEFER)
@@ -224,6 +301,18 @@ static int usb_phy_generic_probe(struct platform_device *pdev)
 	err = usb_phy_gen_create_phy(dev, nop, dev_get_platdata(&pdev->dev));
 	if (err)
 		return err;
+	if (nop->gpiod_vbus) {
+		err = devm_request_threaded_irq(&pdev->dev,
+						gpiod_to_irq(nop->gpiod_vbus),
+						NULL, nop_gpio_vbus_thread,
+						VBUS_IRQ_FLAGS, "vbus_detect",
+						nop);
+		if (err) {
+			dev_err(&pdev->dev, "can't request irq %i, err: %d\n",
+				gpiod_to_irq(nop->gpiod_vbus), err);
+			return err;
+		}
+	}
 
 	nop->phy.init		= usb_gen_phy_init;
 	nop->phy.shutdown	= usb_gen_phy_shutdown;
diff --git a/drivers/usb/phy/phy-generic.h b/drivers/usb/phy/phy-generic.h
index 09924fdaaabe..0d0eadd54ed9 100644
--- a/drivers/usb/phy/phy-generic.h
+++ b/drivers/usb/phy/phy-generic.h
@@ -3,6 +3,7 @@
 
 #include <linux/usb/usb_phy_generic.h>
 #include <linux/gpio/consumer.h>
+#include <linux/regulator/consumer.h>
 
 struct usb_phy_generic {
 	struct usb_phy phy;
@@ -10,6 +11,11 @@ struct usb_phy_generic {
 	struct clk *clk;
 	struct regulator *vcc;
 	struct gpio_desc *gpiod_reset;
+	struct gpio_desc *gpiod_vbus;
+	struct regulator *vbus_draw;
+	bool vbus_draw_enabled;
+	unsigned long mA;
+	unsigned int vbus;
 };
 
 int usb_gen_phy_init(struct usb_phy *phy);
diff --git a/include/linux/usb/usb_phy_generic.h b/include/linux/usb/usb_phy_generic.h
index 68adae83affc..c13632d5292e 100644
--- a/include/linux/usb/usb_phy_generic.h
+++ b/include/linux/usb/usb_phy_generic.h
@@ -2,6 +2,7 @@
 #define __LINUX_USB_NOP_XCEIV_H
 
 #include <linux/usb/otg.h>
+#include <linux/gpio/consumer.h>
 
 struct usb_phy_generic_platform_data {
 	enum usb_phy_type type;
@@ -11,6 +12,7 @@ struct usb_phy_generic_platform_data {
 	unsigned int needs_vcc:1;
 	unsigned int needs_reset:1;	/* deprecated */
 	int gpio_reset;
+	struct gpio_desc *gpiod_vbus;
 };
 
 #if IS_ENABLED(CONFIG_NOP_USB_XCEIV)
-- 
cgit v1.2.3


From cbf6ab52add20b845f903decc973afbd5463c527 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 5 Jan 2015 19:29:32 +0800
Subject: kprobes: Pass the original kprobe for preparing optimized kprobe

Pass the original kprobe for preparing an optimized kprobe arch-dep
part, since for some architecture (e.g. ARM32) requires the information
in original kprobe.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Jon Medhurst <tixy@linaro.org>
---
 arch/x86/kernel/kprobes/opt.c | 3 ++-
 include/linux/kprobes.h       | 3 ++-
 kernel/kprobes.c              | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7c523bbf3dc8..0dd8d089c315 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
  * Target instructions MUST be relocatable (checked inside)
  * This is called when new aggr(opt)probe is allocated or reused.
  */
-int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+				  struct kprobe *__unused)
 {
 	u8 *buf;
 	int ret;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 5297f9fa0ef2..1ab54754a86d 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -308,7 +308,8 @@ struct optimized_kprobe {
 /* Architecture dependent functions for direct jump optimization */
 extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
 extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
-extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
+extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+					 struct kprobe *orig);
 extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
 extern void arch_optimize_kprobes(struct list_head *oplist);
 extern void arch_unoptimize_kprobes(struct list_head *oplist,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 06f58309fed2..bad4e959f2f7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
 	struct optimized_kprobe *op;
 
 	op = container_of(p, struct optimized_kprobe, kp);
-	arch_prepare_optimized_kprobe(op);
+	arch_prepare_optimized_kprobe(op, p);
 }
 
 /* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
 
 	INIT_LIST_HEAD(&op->list);
 	op->kp.addr = p->addr;
-	arch_prepare_optimized_kprobe(op);
+	arch_prepare_optimized_kprobe(op, p);
 
 	return &op->kp;
 }
-- 
cgit v1.2.3


From dd22f551ac0ad366f92f601835f6623b83adc331 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Wed, 7 Jan 2015 18:05:34 +0200
Subject: block: Change direct_access calling convention

In order to support accesses to larger chunks of memory, pass in a
'size' parameter (counted in bytes), and return the amount available at
that address.

Add a new helper function, bdev_direct_access(), to handle common
functionality including partition handling, checking the length requested
is positive, checking for the sector being page-aligned, and checking
the length of the request does not pass the end of the partition.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/filesystems/xip.txt | 15 +++++++++------
 arch/powerpc/sysdev/axonram.c     | 17 ++++-------------
 drivers/block/brd.c               | 14 +++++++-------
 drivers/s390/block/dcssblk.c      | 21 +++++++++-----------
 fs/block_dev.c                    | 40 +++++++++++++++++++++++++++++++++++++++
 fs/ext2/xip.c                     | 31 +++++++++++++-----------------
 include/linux/blkdev.h            |  6 ++++--
 7 files changed, 86 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt
index 0466ee569278..b77472949ede 100644
--- a/Documentation/filesystems/xip.txt
+++ b/Documentation/filesystems/xip.txt
@@ -28,12 +28,15 @@ Implementation
 Execute-in-place is implemented in three steps: block device operation,
 address space operation, and file operations.
 
-A block device operation named direct_access is used to retrieve a
-reference (pointer) to a block on-disk. The reference is supposed to be
-cpu-addressable, physical address and remain valid until the release operation
-is performed. A struct block_device reference is used to address the device,
-and a sector_t argument is used to identify the individual block. As an
-alternative, memory technology devices can be used for this.
+A block device operation named direct_access is used to translate the
+block device sector number to a page frame number (pfn) that identifies
+the physical page for the memory.  It also returns a kernel virtual
+address that can be used to access the memory.
+
+The direct_access method takes a 'size' parameter that indicates the
+number of bytes being requested.  The function should return the number
+of bytes that can be contiguously accessed at that offset.  It may also
+return a negative errno if an error occurs.
 
 The block device operation is optional, these block devices support it as of
 today:
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index f532c92bf99d..20f8afe855d1 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -139,26 +139,17 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  * axon_ram_direct_access - direct_access() method for block device
  * @device, @sector, @data: see block_device_operations method
  */
-static int
+static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-		       void **kaddr, unsigned long *pfn)
+		       void **kaddr, unsigned long *pfn, long size)
 {
 	struct axon_ram_bank *bank = device->bd_disk->private_data;
-	loff_t offset;
-
-	offset = sector;
-	if (device->bd_part != NULL)
-		offset += device->bd_part->start_sect;
-	offset <<= AXON_RAM_SECTOR_SHIFT;
-	if (offset >= bank->size) {
-		dev_err(&bank->device->dev, "Access outside of address space\n");
-		return -ERANGE;
-	}
+	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
 
 	*kaddr = (void *)(bank->ph_addr + offset);
 	*pfn = virt_to_phys(kaddr) >> PAGE_SHIFT;
 
-	return 0;
+	return bank->size - offset;
 }
 
 static const struct block_device_operations axon_ram_devops = {
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3598110d2cef..89e90ec52f28 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 #ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access(struct block_device *bdev, sector_t sector,
-			void **kaddr, unsigned long *pfn)
+static long brd_direct_access(struct block_device *bdev, sector_t sector,
+			void **kaddr, unsigned long *pfn, long size)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
 
 	if (!brd)
 		return -ENODEV;
-	if (sector & (PAGE_SECTORS-1))
-		return -EINVAL;
-	if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
-		return -ERANGE;
 	page = brd_insert_page(brd, sector);
 	if (!page)
 		return -ENOSPC;
 	*kaddr = page_address(page);
 	*pfn = page_to_pfn(page);
 
-	return 0;
+	/*
+	 * TODO: If size > PAGE_SIZE, we could look to see if the next page in
+	 * the file happens to be mapped to the next page of physical RAM.
+	 */
+	return PAGE_SIZE;
 }
 #endif
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index b550c8c8d010..31d6884f3351 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -28,8 +28,8 @@
 static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
-static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-				 void **kaddr, unsigned long *pfn);
+static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
+				 void **kaddr, unsigned long *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -866,25 +866,22 @@ fail:
 	bio_io_error(bio);
 }
 
-static int
+static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-			void **kaddr, unsigned long *pfn)
+			void **kaddr, unsigned long *pfn, long size)
 {
 	struct dcssblk_dev_info *dev_info;
-	unsigned long pgoff;
+	unsigned long offset, dev_sz;
 
 	dev_info = bdev->bd_disk->private_data;
 	if (!dev_info)
 		return -ENODEV;
-	if (secnum % (PAGE_SIZE/512))
-		return -EINVAL;
-	pgoff = secnum / (PAGE_SIZE / 512);
-	if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start)
-		return -ERANGE;
-	*kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE);
+	dev_sz = dev_info->end - dev_info->start;
+	offset = secnum * 512;
+	*kaddr = (void *) (dev_info->start + offset);
 	*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
 
-	return 0;
+	return dev_sz - offset;
 }
 
 static void
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..f314c2c0567d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -429,6 +429,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(bdev_write_page);
 
+/**
+ * bdev_direct_access() - Get the address for directly-accessibly memory
+ * @bdev: The device containing the memory
+ * @sector: The offset within the device
+ * @addr: Where to put the address of the memory
+ * @pfn: The Page Frame Number for the memory
+ * @size: The number of bytes requested
+ *
+ * If a block device is made up of directly addressable memory, this function
+ * will tell the caller the PFN and the address of the memory.  The address
+ * may be directly dereferenced within the kernel without the need to call
+ * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
+ * page tables.
+ *
+ * Return: negative errno if an error occurs, otherwise the number of bytes
+ * accessible at this address.
+ */
+long bdev_direct_access(struct block_device *bdev, sector_t sector,
+			void **addr, unsigned long *pfn, long size)
+{
+	long avail;
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+	if (size < 0)
+		return size;
+	if (!ops->direct_access)
+		return -EOPNOTSUPP;
+	if ((sector + DIV_ROUND_UP(size, 512)) >
+					part_nr_sects_read(bdev->bd_part))
+		return -ERANGE;
+	sector += get_start_sect(bdev);
+	if (sector % (PAGE_SIZE / 512))
+		return -EINVAL;
+	avail = ops->direct_access(bdev, sector, addr, pfn, size);
+	if (!avail)
+		return -ERANGE;
+	return min(avail, size);
+}
+EXPORT_SYMBOL_GPL(bdev_direct_access);
+
 /*
  * pseudo-fs
  */
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index e98171a11cfe..bbc5fec6ff7f 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -13,18 +13,12 @@
 #include "ext2.h"
 #include "xip.h"
 
-static inline int
-__inode_direct_access(struct inode *inode, sector_t block,
-		      void **kaddr, unsigned long *pfn)
+static inline long __inode_direct_access(struct inode *inode, sector_t block,
+				void **kaddr, unsigned long *pfn, long size)
 {
 	struct block_device *bdev = inode->i_sb->s_bdev;
-	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	sector_t sector;
-
-	sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
-
-	BUG_ON(!ops->direct_access);
-	return ops->direct_access(bdev, sector, kaddr, pfn);
+	sector_t sector = block * (PAGE_SIZE / 512);
+	return bdev_direct_access(bdev, sector, kaddr, pfn, size);
 }
 
 static inline int
@@ -53,12 +47,13 @@ ext2_clear_xip_target(struct inode *inode, sector_t block)
 {
 	void *kaddr;
 	unsigned long pfn;
-	int rc;
+	long size;
 
-	rc = __inode_direct_access(inode, block, &kaddr, &pfn);
-	if (!rc)
-		clear_page(kaddr);
-	return rc;
+	size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE);
+	if (size < 0)
+		return size;
+	clear_page(kaddr);
+	return 0;
 }
 
 void ext2_xip_verify_sb(struct super_block *sb)
@@ -77,7 +72,7 @@ void ext2_xip_verify_sb(struct super_block *sb)
 int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
 				void **kmem, unsigned long *pfn)
 {
-	int rc;
+	long rc;
 	sector_t block;
 
 	/* first, retrieve the sector number */
@@ -86,6 +81,6 @@ int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
 		return rc;
 
 	/* retrieve address of the target data */
-	rc = __inode_direct_access(mapping->host, block, kmem, pfn);
-	return rc;
+	rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE);
+	return (rc < 0) ? rc : 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 92f4b4b288dd..e9086be6d9a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1601,8 +1601,8 @@ struct block_device_operations {
 	int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	int (*direct_access) (struct block_device *, sector_t,
-						void **, unsigned long *);
+	long (*direct_access)(struct block_device *, sector_t,
+					void **, unsigned long *pfn, long size);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1620,6 +1620,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
+extern long bdev_direct_access(struct block_device *, sector_t, void **addr,
+						unsigned long *pfn, long size);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit v1.2.3


From 92a8b064fd4412d4dd32ef598b6f9e55feef9e51 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 14 Jan 2015 17:21:53 +0100
Subject: mfd: syscon: Add atmel-matrix registers definition

AT91 SoCs have a memory range reserved for internal bus configuration.
Expose those registers so that drivers can make use of the matrix syscon
declared in at91 DTs.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 include/linux/mfd/syscon/atmel-matrix.h | 117 ++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 include/linux/mfd/syscon/atmel-matrix.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/syscon/atmel-matrix.h b/include/linux/mfd/syscon/atmel-matrix.h
new file mode 100644
index 000000000000..8293c3e2a82a
--- /dev/null
+++ b/include/linux/mfd/syscon/atmel-matrix.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (C) 2014 Atmel Corporation.
+ *
+ * Memory Controllers (MATRIX, EBI) - System peripherals registers.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _LINUX_MFD_SYSCON_ATMEL_MATRIX_H
+#define _LINUX_MFD_SYSCON_ATMEL_MATRIX_H
+
+#define AT91SAM9260_MATRIX_MCFG			0x00
+#define AT91SAM9260_MATRIX_SCFG			0x40
+#define AT91SAM9260_MATRIX_PRS			0x80
+#define AT91SAM9260_MATRIX_MRCR			0x100
+#define AT91SAM9260_MATRIX_EBICSA		0x11c
+
+#define AT91SAM9261_MATRIX_MRCR			0x0
+#define AT91SAM9261_MATRIX_SCFG			0x4
+#define AT91SAM9261_MATRIX_TCR			0x24
+#define AT91SAM9261_MATRIX_EBICSA		0x30
+#define AT91SAM9261_MATRIX_USBPUCR		0x34
+
+#define AT91SAM9263_MATRIX_MCFG			0x00
+#define AT91SAM9263_MATRIX_SCFG			0x40
+#define AT91SAM9263_MATRIX_PRS			0x80
+#define AT91SAM9263_MATRIX_MRCR			0x100
+#define AT91SAM9263_MATRIX_TCR			0x114
+#define AT91SAM9263_MATRIX_EBI0CSA		0x120
+#define AT91SAM9263_MATRIX_EBI1CSA		0x124
+
+#define AT91SAM9RL_MATRIX_MCFG			0x00
+#define AT91SAM9RL_MATRIX_SCFG			0x40
+#define AT91SAM9RL_MATRIX_PRS			0x80
+#define AT91SAM9RL_MATRIX_MRCR			0x100
+#define AT91SAM9RL_MATRIX_TCR			0x114
+#define AT91SAM9RL_MATRIX_EBICSA		0x120
+
+#define AT91SAM9G45_MATRIX_MCFG			0x00
+#define AT91SAM9G45_MATRIX_SCFG			0x40
+#define AT91SAM9G45_MATRIX_PRS			0x80
+#define AT91SAM9G45_MATRIX_MRCR			0x100
+#define AT91SAM9G45_MATRIX_TCR			0x110
+#define AT91SAM9G45_MATRIX_DDRMPR		0x118
+#define AT91SAM9G45_MATRIX_EBICSA		0x128
+
+#define AT91SAM9N12_MATRIX_MCFG			0x00
+#define AT91SAM9N12_MATRIX_SCFG			0x40
+#define AT91SAM9N12_MATRIX_PRS			0x80
+#define AT91SAM9N12_MATRIX_MRCR			0x100
+#define AT91SAM9N12_MATRIX_EBICSA		0x118
+
+#define AT91SAM9X5_MATRIX_MCFG			0x00
+#define AT91SAM9X5_MATRIX_SCFG			0x40
+#define AT91SAM9X5_MATRIX_PRS			0x80
+#define AT91SAM9X5_MATRIX_MRCR			0x100
+#define AT91SAM9X5_MATRIX_EBICSA		0x120
+
+#define SAMA5D3_MATRIX_MCFG			0x00
+#define SAMA5D3_MATRIX_SCFG			0x40
+#define SAMA5D3_MATRIX_PRS			0x80
+#define SAMA5D3_MATRIX_MRCR			0x100
+
+#define AT91_MATRIX_MCFG(o, x)			((o) + ((x) * 0x4))
+#define AT91_MATRIX_ULBT			GENMASK(2, 0)
+#define AT91_MATRIX_ULBT_INFINITE		(0 << 0)
+#define AT91_MATRIX_ULBT_SINGLE			(1 << 0)
+#define AT91_MATRIX_ULBT_FOUR			(2 << 0)
+#define AT91_MATRIX_ULBT_EIGHT			(3 << 0)
+#define AT91_MATRIX_ULBT_SIXTEEN		(4 << 0)
+
+#define AT91_MATRIX_SCFG(o, x)			((o) + ((x) * 0x4))
+#define AT91_MATRIX_SLOT_CYCLE			GENMASK(7,  0)
+#define AT91_MATRIX_DEFMSTR_TYPE		GENMASK(17, 16)
+#define AT91_MATRIX_DEFMSTR_TYPE_NONE		(0 << 16)
+#define AT91_MATRIX_DEFMSTR_TYPE_LAST		(1 << 16)
+#define AT91_MATRIX_DEFMSTR_TYPE_FIXED		(2 << 16)
+#define AT91_MATRIX_FIXED_DEFMSTR		GENMASK(20, 18)
+#define AT91_MATRIX_ARBT			GENMASK(25, 24)
+#define AT91_MATRIX_ARBT_ROUND_ROBIN		(0 << 24)
+#define AT91_MATRIX_ARBT_FIXED_PRIORITY		(1 << 24)
+
+#define AT91_MATRIX_ITCM_SIZE			GENMASK(3, 0)
+#define AT91_MATRIX_ITCM_0			(0 << 0)
+#define AT91_MATRIX_ITCM_16			(5 << 0)
+#define AT91_MATRIX_ITCM_32			(6 << 0)
+#define AT91_MATRIX_ITCM_64			(7 << 0)
+#define	AT91_MATRIX_DTCM_SIZE			GENMASK(7, 4)
+#define	AT91_MATRIX_DTCM_0			(0 << 4)
+#define	AT91_MATRIX_DTCM_16			(5 << 4)
+#define AT91_MATRIX_DTCM_32			(6 << 4)
+#define AT91_MATRIX_DTCM_64			(7 << 4)
+
+#define AT91_MATRIX_PRAS(o, x)			((o) + ((x) * 0x8))
+#define AT91_MATRIX_PRBS(o, x)			((o) + ((x) * 0x8) + 0x4)
+#define AT91_MATRIX_MPR(x)			GENMASK(((x) * 0x4) + 1, ((x) * 0x4))
+
+#define AT91_MATRIX_RCB(x)			BIT(x)
+
+#define AT91_MATRIX_CSA(cs, val)		(val << (cs))
+#define AT91_MATRIX_DBPUC			BIT(8)
+#define AT91_MATRIX_DBPDC			BIT(9)
+#define AT91_MATRIX_VDDIOMSEL			BIT(16)
+#define AT91_MATRIX_VDDIOMSEL_1_8V		(0 << 16)
+#define AT91_MATRIX_VDDIOMSEL_3_3V		(1 << 16)
+#define AT91_MATRIX_EBI_IOSR			BIT(17)
+#define AT91_MATRIX_DDR_IOSR			BIT(18)
+#define AT91_MATRIX_NFD0_SELECT			BIT(24)
+#define AT91_MATRIX_DDR_MP_EN			BIT(25)
+#define AT91_MATRIX_EBI_NUM_CS			8
+
+#define AT91_MATRIX_USBPUCR_PUON		BIT(30)
+
+#endif /* _LINUX_MFD_SYSCON_ATMEL_MATRIX_H */
-- 
cgit v1.2.3


From 57d8acbacaace9a4d95ebde1c13b94d293ff21f3 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 14 Jan 2015 17:21:55 +0100
Subject: mfd: syscon: Add atmel-smc registers definition

Atmel AT91 SoCs have a memory range reserved for SMC (Static Memory
Controller) configuration.
Expose those registers so that drivers can make use of the smc syscon
declared in at91 DTs.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 include/linux/mfd/syscon/atmel-smc.h | 173 +++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 include/linux/mfd/syscon/atmel-smc.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h
new file mode 100644
index 000000000000..be6ebe64eebe
--- /dev/null
+++ b/include/linux/mfd/syscon/atmel-smc.h
@@ -0,0 +1,173 @@
+/*
+ * Atmel SMC (Static Memory Controller) register offsets and bit definitions.
+ *
+ * Copyright (C) 2014 Atmel
+ * Copyright (C) 2014 Free Electrons
+ *
+ * Author: Boris Brezillon <boris.brezillon@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_MFD_SYSCON_ATMEL_SMC_H_
+#define _LINUX_MFD_SYSCON_ATMEL_SMC_H_
+
+#include <linux/kernel.h>
+#include <linux/regmap.h>
+
+#define AT91SAM9_SMC_GENERIC		0x00
+#define AT91SAM9_SMC_GENERIC_BLK_SZ	0x10
+
+#define SAMA5_SMC_GENERIC		0x600
+#define SAMA5_SMC_GENERIC_BLK_SZ	0x14
+
+#define AT91SAM9_SMC_SETUP(o)		((o) + 0x00)
+#define AT91SAM9_SMC_NWESETUP(x)	(x)
+#define AT91SAM9_SMC_NCS_WRSETUP(x)	((x) << 8)
+#define AT91SAM9_SMC_NRDSETUP(x)	((x) << 16)
+#define AT91SAM9_SMC_NCS_NRDSETUP(x)	((x) << 24)
+
+#define AT91SAM9_SMC_PULSE(o)		((o) + 0x04)
+#define AT91SAM9_SMC_NWEPULSE(x)	(x)
+#define AT91SAM9_SMC_NCS_WRPULSE(x)	((x) << 8)
+#define AT91SAM9_SMC_NRDPULSE(x)	((x) << 16)
+#define AT91SAM9_SMC_NCS_NRDPULSE(x)	((x) << 24)
+
+#define AT91SAM9_SMC_CYCLE(o)		((o) + 0x08)
+#define AT91SAM9_SMC_NWECYCLE(x)	(x)
+#define AT91SAM9_SMC_NRDCYCLE(x)	((x) << 16)
+
+#define AT91SAM9_SMC_MODE(o)		((o) + 0x0c)
+#define SAMA5_SMC_MODE(o)		((o) + 0x10)
+#define AT91_SMC_READMODE		BIT(0)
+#define AT91_SMC_READMODE_NCS		(0 << 0)
+#define AT91_SMC_READMODE_NRD		(1 << 0)
+#define AT91_SMC_WRITEMODE		BIT(1)
+#define AT91_SMC_WRITEMODE_NCS		(0 << 1)
+#define AT91_SMC_WRITEMODE_NWE		(1 << 1)
+#define AT91_SMC_EXNWMODE		GENMASK(5, 4)
+#define AT91_SMC_EXNWMODE_DISABLE	(0 << 4)
+#define AT91_SMC_EXNWMODE_FROZEN	(2 << 4)
+#define AT91_SMC_EXNWMODE_READY		(3 << 4)
+#define AT91_SMC_BAT			BIT(8)
+#define AT91_SMC_BAT_SELECT		(0 << 8)
+#define AT91_SMC_BAT_WRITE		(1 << 8)
+#define AT91_SMC_DBW			GENMASK(13, 12)
+#define AT91_SMC_DBW_8			(0 << 12)
+#define AT91_SMC_DBW_16			(1 << 12)
+#define AT91_SMC_DBW_32			(2 << 12)
+#define AT91_SMC_TDF			GENMASK(19, 16)
+#define AT91_SMC_TDF_(x)		((((x) - 1) << 16) & AT91_SMC_TDF)
+#define AT91_SMC_TDF_MAX		16
+#define AT91_SMC_TDFMODE_OPTIMIZED	BIT(20)
+#define AT91_SMC_PMEN			BIT(24)
+#define AT91_SMC_PS			GENMASK(29, 28)
+#define AT91_SMC_PS_4			(0 << 28)
+#define AT91_SMC_PS_8			(1 << 28)
+#define AT91_SMC_PS_16			(2 << 28)
+#define AT91_SMC_PS_32			(3 << 28)
+
+
+/*
+ * This function converts a setup timing expressed in nanoseconds into an
+ * encoded value that can be written in the SMC_SETUP register.
+ *
+ * The following formula is described in atmel datasheets (section
+ * "SMC Setup Register"):
+ *
+ * setup length = (128* SETUP[5] + SETUP[4:0])
+ *
+ * where setup length is the timing expressed in cycles.
+ */
+static inline u32 at91sam9_smc_setup_ns_to_cycles(unsigned int clk_rate,
+						  u32 timing_ns)
+{
+	u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate);
+	u32 coded_cycles = 0;
+	u32 cycles;
+
+	cycles = DIV_ROUND_UP(timing_ns, clk_period);
+	if (cycles / 32) {
+		coded_cycles |= 1 << 5;
+		if (cycles < 128)
+			cycles = 0;
+	}
+
+	coded_cycles |= cycles % 32;
+
+	return coded_cycles;
+}
+
+/*
+ * This function converts a pulse timing expressed in nanoseconds into an
+ * encoded value that can be written in the SMC_PULSE register.
+ *
+ * The following formula is described in atmel datasheets (section
+ * "SMC Pulse Register"):
+ *
+ * pulse length = (256* PULSE[6] + PULSE[5:0])
+ *
+ * where pulse length is the timing expressed in cycles.
+ */
+static inline u32 at91sam9_smc_pulse_ns_to_cycles(unsigned int clk_rate,
+						  u32 timing_ns)
+{
+	u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate);
+	u32 coded_cycles = 0;
+	u32 cycles;
+
+	cycles = DIV_ROUND_UP(timing_ns, clk_period);
+	if (cycles / 64) {
+		coded_cycles |= 1 << 6;
+		if (cycles < 256)
+			cycles = 0;
+	}
+
+	coded_cycles |= cycles % 64;
+
+	return coded_cycles;
+}
+
+/*
+ * This function converts a cycle timing expressed in nanoseconds into an
+ * encoded value that can be written in the SMC_CYCLE register.
+ *
+ * The following formula is described in atmel datasheets (section
+ * "SMC Cycle Register"):
+ *
+ * cycle length = (CYCLE[8:7]*256 + CYCLE[6:0])
+ *
+ * where cycle length is the timing expressed in cycles.
+ */
+static inline u32 at91sam9_smc_cycle_ns_to_cycles(unsigned int clk_rate,
+						  u32 timing_ns)
+{
+	u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate);
+	u32 coded_cycles = 0;
+	u32 cycles;
+
+	cycles = DIV_ROUND_UP(timing_ns, clk_period);
+	if (cycles / 128) {
+		coded_cycles = cycles / 256;
+		cycles %= 256;
+		if (cycles >= 128) {
+			coded_cycles++;
+			cycles = 0;
+		}
+
+		if (coded_cycles > 0x3) {
+			coded_cycles = 0x3;
+			cycles = 0x7f;
+		}
+
+		coded_cycles <<= 7;
+	}
+
+	coded_cycles |= cycles % 128;
+
+	return coded_cycles;
+}
+
+#endif /* _LINUX_MFD_SYSCON_ATMEL_SMC_H_ */
-- 
cgit v1.2.3


From 2397aa8b515f7bd77c8d5698170b6a98fdd6721c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 13 Jan 2015 11:02:54 -0500
Subject: svcrdma: Clean up read chunk counting

The byte_count argument is not used, and the function is called
only from one place.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h         |  2 --
 net/sunrpc/xprtrdma/svc_rdma_marshal.c  | 16 ----------------
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 15 ++++++++++++---
 3 files changed, 12 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 975da754c778..2280325e4c88 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -178,8 +178,6 @@ struct svcxprt_rdma {
 #define RPCRDMA_MAX_REQ_SIZE    4096
 
 /* svc_rdma_marshal.c */
-extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
-				      int *, int *);
 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
 extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 65b146297f5a..b681855cf970 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -70,22 +70,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
 	return (u32 *)&ch->rc_position;
 }
 
-/*
- * Determine number of chunks and total bytes in chunk list. The chunk
- * list has already been verified to fit within the RPCRDMA header.
- */
-void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
-			       int *ch_count, int *byte_count)
-{
-	/* compute the number of bytes represented by read chunks */
-	*byte_count = 0;
-	*ch_count = 0;
-	for (; ch->rc_discrim != 0; ch++) {
-		*byte_count = *byte_count + ntohl(ch->rc_target.rs_length);
-		*ch_count = *ch_count + 1;
-	}
-}
-
 /*
  * Decodes a write chunk list. The expected format is as follows:
  *    descrim  : xdr_one
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2c67de032009..b3b7bb85844d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -365,12 +365,22 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
 	return ret;
 }
 
+static unsigned int
+rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch)
+{
+	unsigned int count;
+
+	for (count = 0; ch->rc_discrim != xdr_zero; ch++)
+		count++;
+	return count;
+}
+
 static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 			    struct rpcrdma_msg *rmsgp,
 			    struct svc_rqst *rqstp,
 			    struct svc_rdma_op_ctxt *head)
 {
-	int page_no, ch_count, ret;
+	int page_no, ret;
 	struct rpcrdma_read_chunk *ch;
 	u32 page_offset, byte_count;
 	u64 rs_offset;
@@ -381,8 +391,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	if (!ch)
 		return 0;
 
-	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
-	if (ch_count > RPCSVC_MAXPAGES)
+	if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES)
 		return -EINVAL;
 
 	/* The request is completed when the RDMA_READs complete. The
-- 
cgit v1.2.3


From e54524111f51eac1900cf91aca3d38a92a6b11c0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 13 Jan 2015 11:03:20 -0500
Subject: svcrdma: Plant reader function in struct svcxprt_rdma

The RDMA reader function doesn't change once an svcxprt_rdma is
instantiated. Instead of checking sc_devcap during every incoming
RPC, set the reader function once when the connection is accepted.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          | 10 +++++
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 71 ++++++++++++--------------------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  2 +
 3 files changed, 39 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 2280325e4c88..f161e309f25e 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -150,6 +150,10 @@ struct svcxprt_rdma {
 	struct ib_cq         *sc_rq_cq;
 	struct ib_cq         *sc_sq_cq;
 	struct ib_mr         *sc_phys_mr;	/* MR for server memory */
+	int		     (*sc_reader)(struct svcxprt_rdma *,
+					  struct svc_rqst *,
+					  struct svc_rdma_op_ctxt *,
+					  int *, u32 *, u32, u32, u64, bool);
 	u32		     sc_dev_caps;	/* distilled device caps */
 	u32		     sc_dma_lkey;	/* local dma key */
 	unsigned int	     sc_frmr_pg_list_len;
@@ -195,6 +199,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
 
 /* svc_rdma_recvfrom.c */
 extern int svc_rdma_recvfrom(struct svc_rqst *);
+extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *,
+			       struct svc_rdma_op_ctxt *, int *, u32 *,
+			       u32, u32, u64, bool);
+extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
+				struct svc_rdma_op_ctxt *, int *, u32 *,
+				u32, u32, u64, bool);
 
 /* svc_rdma_sendto.c */
 extern int svc_rdma_sendto(struct svc_rqst *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 577f8659ca30..c3aebc1bf0a6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -117,26 +117,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
 		return min_t(int, sge_count, xprt->sc_max_sge);
 }
 
-typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
-			      struct svc_rqst *rqstp,
-			      struct svc_rdma_op_ctxt *head,
-			      int *page_no,
-			      u32 *page_offset,
-			      u32 rs_handle,
-			      u32 rs_length,
-			      u64 rs_offset,
-			      int last);
-
 /* Issue an RDMA_READ using the local lkey to map the data sink */
-static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
-			       struct svc_rqst *rqstp,
-			       struct svc_rdma_op_ctxt *head,
-			       int *page_no,
-			       u32 *page_offset,
-			       u32 rs_handle,
-			       u32 rs_length,
-			       u64 rs_offset,
-			       int last)
+int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
+			struct svc_rqst *rqstp,
+			struct svc_rdma_op_ctxt *head,
+			int *page_no,
+			u32 *page_offset,
+			u32 rs_handle,
+			u32 rs_length,
+			u64 rs_offset,
+			bool last)
 {
 	struct ib_send_wr read_wr;
 	int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
@@ -221,15 +211,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 }
 
 /* Issue an RDMA_READ using an FRMR to map the data sink */
-static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
-				struct svc_rqst *rqstp,
-				struct svc_rdma_op_ctxt *head,
-				int *page_no,
-				u32 *page_offset,
-				u32 rs_handle,
-				u32 rs_length,
-				u64 rs_offset,
-				int last)
+int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+			 struct svc_rqst *rqstp,
+			 struct svc_rdma_op_ctxt *head,
+			 int *page_no,
+			 u32 *page_offset,
+			 u32 rs_handle,
+			 u32 rs_length,
+			 u64 rs_offset,
+			 bool last)
 {
 	struct ib_send_wr read_wr;
 	struct ib_send_wr inv_wr;
@@ -374,9 +364,9 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 {
 	int page_no, ret;
 	struct rpcrdma_read_chunk *ch;
-	u32 page_offset, byte_count;
+	u32 handle, page_offset, byte_count;
 	u64 rs_offset;
-	rdma_reader_fn reader;
+	bool last;
 
 	/* If no read list is present, return 0 */
 	ch = svc_rdma_get_read_chunk(rmsgp);
@@ -399,27 +389,20 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	head->arg.len = rqstp->rq_arg.len;
 	head->arg.buflen = rqstp->rq_arg.buflen;
 
-	/* Use FRMR if supported */
-	if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
-		reader = rdma_read_chunk_frmr;
-	else
-		reader = rdma_read_chunk_lcl;
-
 	page_no = 0; page_offset = 0;
 	for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
 	     ch->rc_discrim != 0; ch++) {
-
+		handle = be32_to_cpu(ch->rc_target.rs_handle);
+		byte_count = be32_to_cpu(ch->rc_target.rs_length);
 		xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
 				 &rs_offset);
-		byte_count = ntohl(ch->rc_target.rs_length);
 
 		while (byte_count > 0) {
-			ret = reader(xprt, rqstp, head,
-				     &page_no, &page_offset,
-				     ntohl(ch->rc_target.rs_handle),
-				     byte_count, rs_offset,
-				     ((ch+1)->rc_discrim == 0) /* last */
-				     );
+			last = (ch + 1)->rc_discrim == xdr_zero;
+			ret = xprt->sc_reader(xprt, rqstp, head,
+					      &page_no, &page_offset,
+					      handle, byte_count,
+					      rs_offset, last);
 			if (ret < 0)
 				goto err;
 			byte_count -= ret;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f2e059bbab42..f609c1c2d38d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -974,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	 * NB:	iWARP requires remote write access for the data sink
 	 *	of an RDMA_READ. IB does not.
 	 */
+	newxprt->sc_reader = rdma_read_chunk_lcl;
 	if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
 		newxprt->sc_frmr_pg_list_len =
 			devattr.max_fast_reg_page_list_len;
 		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+		newxprt->sc_reader = rdma_read_chunk_frmr;
 	}
 
 	/*
-- 
cgit v1.2.3


From 0b056c224bea63060ce8a981e84193c93fac6f5d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 13 Jan 2015 11:03:37 -0500
Subject: svcrdma: Support RDMA_NOMSG requests

Currently the Linux server can not decode RDMA_NOMSG type requests.
Operations whose length exceeds the fixed size of RDMA SEND buffers,
like large NFSv4 CREATE(NF4LNK) operations, must be conveyed via
RDMA_NOMSG.

For an RDMA_MSG type request, the client sends the RPC/RDMA, RPC
headers, and some or all of the NFS arguments via RDMA SEND.

For an RDMA_NOMSG type request, the client sends just the RPC/RDMA
header via RDMA SEND. The request's read list contains elements for
the entire RPC message, including the RPC header.

NFSD expects the RPC/RMDA header and RPC header to be contiguous in
page zero of the XDR buffer. Add logic in the RDMA READ path to make
the read list contents land where the server prefers, when the
incoming message is a type RDMA_NOMSG message.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h         |  1 +
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 39 ++++++++++++++++++++++++++++++---
 2 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index f161e309f25e..c343a94bc791 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -79,6 +79,7 @@ struct svc_rdma_op_ctxt {
 	enum ib_wr_opcode wr_op;
 	enum ib_wc_status wc_status;
 	u32 byte_len;
+	u32 position;
 	struct svcxprt_rdma *xprt;
 	unsigned long flags;
 	enum dma_data_direction direction;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index a67dd1a081dd..36cf51a3eab7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -60,6 +60,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 			       struct svc_rdma_op_ctxt *ctxt,
 			       u32 byte_count)
 {
+	struct rpcrdma_msg *rmsgp;
 	struct page *page;
 	u32 bc;
 	int sge_no;
@@ -82,7 +83,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	/* If data remains, store it in the pagelist */
 	rqstp->rq_arg.page_len = bc;
 	rqstp->rq_arg.page_base = 0;
-	rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
+	/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+	if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+		rqstp->rq_arg.pages = &rqstp->rq_pages[0];
+	else
+		rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
 	sge_no = 1;
 	while (bc && sge_no < ctxt->count) {
 		page = ctxt->pages[sge_no];
@@ -383,7 +391,6 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	 */
 	head->arg.head[0] = rqstp->rq_arg.head[0];
 	head->arg.tail[0] = rqstp->rq_arg.tail[0];
-	head->arg.pages = &head->pages[head->count];
 	head->hdr_count = head->count;
 	head->arg.page_base = 0;
 	head->arg.page_len = 0;
@@ -393,9 +400,17 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
 	position = be32_to_cpu(ch->rc_position);
 
+	/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+	if (position == 0) {
+		head->arg.pages = &head->pages[0];
+		page_offset = head->byte_len;
+	} else {
+		head->arg.pages = &head->pages[head->count];
+		page_offset = 0;
+	}
+
 	ret = 0;
 	page_no = 0;
-	page_offset = 0;
 	for (; ch->rc_discrim != xdr_zero; ch++) {
 		if (be32_to_cpu(ch->rc_position) != position)
 			goto err;
@@ -418,7 +433,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 			head->arg.buflen += ret;
 		}
 	}
+
 	ret = 1;
+	head->position = position;
+
  err:
 	/* Detach arg pages. svc_recv will replenish them */
 	for (page_no = 0;
@@ -465,6 +483,21 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 		put_page(rqstp->rq_pages[page_no]);
 		rqstp->rq_pages[page_no] = head->pages[page_no];
 	}
+
+	/* Adjustments made for RDMA_NOMSG type requests */
+	if (head->position == 0) {
+		if (head->arg.len <= head->sge[0].length) {
+			head->arg.head[0].iov_len = head->arg.len -
+							head->byte_len;
+			head->arg.page_len = 0;
+		} else {
+			head->arg.head[0].iov_len = head->sge[0].length -
+								head->byte_len;
+			head->arg.page_len = head->arg.len -
+						head->sge[0].length;
+		}
+	}
+
 	/* Point rq_arg.pages past header */
 	rdma_fix_xdr_pad(&head->arg);
 	rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
-- 
cgit v1.2.3


From ba0513b5b8ffbcb0cc89e2f172c0bcb70497ba2e Mon Sep 17 00:00:00 2001
From: Mario Smarduch <m.smarduch@samsung.com>
Date: Thu, 15 Jan 2015 15:58:53 -0800
Subject: KVM: Add generic support for dirty page logging

kvm_get_dirty_log() provides generic handling of dirty bitmap, currently reused
by several architectures. Building on that we intrdoduce
kvm_get_dirty_log_protect() adding write protection to mark these pages dirty
for future write access, before next KVM_GET_DIRTY_LOG ioctl call from user
space.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 include/linux/kvm_host.h |  9 ++++++
 virt/kvm/Kconfig         |  6 ++++
 virt/kvm/kvm_main.c      | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 26f106022c88..3b934cc94cc8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -611,6 +611,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 
 int kvm_get_dirty_log(struct kvm *kvm,
 			struct kvm_dirty_log *log, int *is_dirty);
+
+int kvm_get_dirty_log_protect(struct kvm *kvm,
+			struct kvm_dirty_log *log, bool *is_dirty);
+
+void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
+					struct kvm_memory_slot *slot,
+					gfn_t gfn_offset,
+					unsigned long mask);
+
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 3796a2132a06..314950c51d9f 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -40,3 +40,9 @@ config KVM_VFIO
 
 config HAVE_KVM_ARCH_TLB_FLUSH_ALL
        bool
+
+config HAVE_KVM_ARCH_DIRTY_LOG_PROTECT
+       bool
+
+config KVM_GENERIC_DIRTYLOG_READ_PROTECT
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d03bd2255801..246cf291c6fd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -995,6 +995,86 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ *	are dirty write protect them for next write.
+ * @kvm:	pointer to kvm instance
+ * @log:	slot id and address to which we copy the log
+ * @is_dirty:	flag set if any page is dirty
+ *
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently. So, to avoid losing track of dirty pages we keep the
+ * following order:
+ *
+ *    1. Take a snapshot of the bit and clear it if needed.
+ *    2. Write protect the corresponding page.
+ *    3. Copy the snapshot to the userspace.
+ *    4. Upon return caller flushes TLB's if needed.
+ *
+ * Between 2 and 4, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page is reported dirty using
+ * the snapshot taken before and step 4 ensures that writes done after
+ * exiting to userspace will be logged for the next call.
+ *
+ */
+int kvm_get_dirty_log_protect(struct kvm *kvm,
+			struct kvm_dirty_log *log, bool *is_dirty)
+{
+	struct kvm_memory_slot *memslot;
+	int r, i;
+	unsigned long n;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+
+	r = -EINVAL;
+	if (log->slot >= KVM_USER_MEM_SLOTS)
+		goto out;
+
+	memslot = id_to_memslot(kvm->memslots, log->slot);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	r = -ENOENT;
+	if (!dirty_bitmap)
+		goto out;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
+
+	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+	memset(dirty_bitmap_buffer, 0, n);
+
+	spin_lock(&kvm->mmu_lock);
+	*is_dirty = false;
+	for (i = 0; i < n / sizeof(long); i++) {
+		unsigned long mask;
+		gfn_t offset;
+
+		if (!dirty_bitmap[i])
+			continue;
+
+		*is_dirty = true;
+
+		mask = xchg(&dirty_bitmap[i], 0);
+		dirty_bitmap_buffer[i] = mask;
+
+		offset = i * BITS_PER_LONG;
+		kvm_arch_mmu_write_protect_pt_masked(kvm, memslot, offset,
+								mask);
+	}
+
+	spin_unlock(&kvm->mmu_lock);
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		goto out;
+
+	r = 0;
+out:
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+#endif
+
 bool kvm_largepages_enabled(void)
 {
 	return largepages_enabled;
-- 
cgit v1.2.3


From 67c2b9cb30f561325b010e046b7bbe2a327e69a0 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Tue, 18 Nov 2014 12:18:18 +0100
Subject: ARM: 8207/1: amba: Use inlines instead of macros for
 amba_pclk_enable/disable

Replace the amba_pclk_enable and amba_pclk_disable macros with static
inline functions and remove checks for IS_ERR. The amba bus clock won't
be ERR because probe would fail before the use of these functions.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/amba/bus.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 2afc618b15ce..0ab5f8e0dea2 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -92,11 +92,15 @@ struct amba_device *amba_find_device(const char *, struct device *, unsigned int
 int amba_request_regions(struct amba_device *, const char *);
 void amba_release_regions(struct amba_device *);
 
-#define amba_pclk_enable(d)	\
-	(IS_ERR((d)->pclk) ? 0 : clk_enable((d)->pclk))
+static inline int amba_pclk_enable(struct amba_device *dev)
+{
+	return clk_enable(dev->pclk);
+}
 
-#define amba_pclk_disable(d)	\
-	do { if (!IS_ERR((d)->pclk)) clk_disable((d)->pclk); } while (0)
+static inline void amba_pclk_disable(struct amba_device *dev)
+{
+	clk_disable(dev->pclk);
+}
 
 static inline int amba_pclk_prepare(struct amba_device *dev)
 {
-- 
cgit v1.2.3


From 78e1f974dd351ac82d978e3aac2d27422013f914 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Sun, 14 Dec 2014 02:37:09 +0200
Subject: iommu/ipmmu-vmsa: Remove platform data support

No board file instantiates the IPMMU using platform data. Now that we
have DT support, get rid of platform data.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
---
 drivers/iommu/ipmmu-vmsa.c               | 24 ------------------------
 include/linux/platform_data/ipmmu-vmsa.h | 24 ------------------------
 2 files changed, 48 deletions(-)
 delete mode 100644 include/linux/platform_data/ipmmu-vmsa.h

(limited to 'include/linux')

diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 5d080cf11ba5..791c3daec7c0 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -17,7 +17,6 @@
 #include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/of.h>
-#include <linux/platform_data/ipmmu-vmsa.h>
 #include <linux/platform_device.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
@@ -30,7 +29,6 @@ struct ipmmu_vmsa_device {
 	void __iomem *base;
 	struct list_head list;
 
-	const struct ipmmu_vmsa_platform_data *pdata;
 	unsigned int num_utlbs;
 
 	struct dma_iommu_mapping *mapping;
@@ -1015,27 +1013,6 @@ static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev,
 	unsigned int i;
 	int count;
 
-	if (mmu->pdata) {
-		const struct ipmmu_vmsa_master *master = mmu->pdata->masters;
-		const char *devname = dev_name(dev);
-		unsigned int i;
-
-		for (i = 0; i < mmu->pdata->num_masters; ++i, ++master) {
-			if (strcmp(master->name, devname) == 0) {
-				utlbs = kmalloc(sizeof(*utlbs), GFP_KERNEL);
-				if (!utlbs)
-					return -ENOMEM;
-
-				utlbs[0] = master->utlb;
-
-				*_utlbs = utlbs;
-				return 1;
-			}
-		}
-
-		return -EINVAL;
-	}
-
 	count = of_count_phandle_with_args(dev->of_node, "iommus",
 					   "#iommu-cells");
 	if (count < 0)
@@ -1246,7 +1223,6 @@ static int ipmmu_probe(struct platform_device *pdev)
 	}
 
 	mmu->dev = &pdev->dev;
-	mmu->pdata = pdev->dev.platform_data;
 	mmu->num_utlbs = 32;
 
 	/* Map I/O memory and request IRQ. */
diff --git a/include/linux/platform_data/ipmmu-vmsa.h b/include/linux/platform_data/ipmmu-vmsa.h
deleted file mode 100644
index 5275b3ac6d37..000000000000
--- a/include/linux/platform_data/ipmmu-vmsa.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * IPMMU VMSA Platform Data
- *
- * Copyright (C) 2014 Renesas Electronics Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- */
-
-#ifndef __IPMMU_VMSA_H__
-#define __IPMMU_VMSA_H__
-
-struct ipmmu_vmsa_master {
-	const char *name;
-	unsigned int utlb;
-};
-
-struct ipmmu_vmsa_platform_data {
-	const struct ipmmu_vmsa_master *masters;
-	unsigned int num_masters;
-};
-
-#endif /* __IPMMU_VMSA_H__ */
-- 
cgit v1.2.3


From b9626f32876debc356fadb6e19aebcfe9d70c5ff Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Mon, 1 Dec 2014 19:32:51 +0100
Subject: tpm/tpm_i2c_stm_st33: Add new tpm_stm_dev structure and remove
 tpm_i2c_buffer[0], [1] buffer.

In order to clean big buffers in st33zp24_platform_data structure,
replace with tpm_stm_dev for driver internal usage.
As only one buffer is really necessary replace with buf field.

In the mean time move tpm_i2c_stm_st33.h to include/linux/platform_data.

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Peter Huewe <peterhuewe@gmx.de>
---
 drivers/char/tpm/tpm_i2c_stm_st33.c            | 204 +++++++++++--------------
 drivers/char/tpm/tpm_i2c_stm_st33.h            |  44 ------
 include/linux/platform_data/tpm_i2c_stm_st33.h |  40 +++++
 3 files changed, 132 insertions(+), 156 deletions(-)
 delete mode 100644 drivers/char/tpm/tpm_i2c_stm_st33.h
 create mode 100644 include/linux/platform_data/tpm_i2c_stm_st33.h

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c
index 44a02103ad49..4f725386385f 100644
--- a/drivers/char/tpm/tpm_i2c_stm_st33.c
+++ b/drivers/char/tpm/tpm_i2c_stm_st33.c
@@ -40,7 +40,6 @@
 #include <linux/wait.h>
 #include <linux/string.h>
 #include <linux/interrupt.h>
-#include <linux/spinlock.h>
 #include <linux/sysfs.h>
 #include <linux/gpio.h>
 #include <linux/sched.h>
@@ -48,6 +47,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 
+#include <linux/platform_data/tpm_i2c_stm_st33.h>
 #include "tpm.h"
 
 #define TPM_ACCESS			0x0
@@ -66,7 +66,7 @@
 #define TPM_BUFSIZE			2048
 
 #define LOCALITY0		0
-#include "tpm_i2c_stm_st33.h"
+
 
 enum stm33zp24_access {
 	TPM_ACCESS_VALID = 0x80,
@@ -98,6 +98,15 @@ enum tis_defaults {
 	TIS_LONG_TIMEOUT = 2000,
 };
 
+struct tpm_stm_dev {
+	struct i2c_client *client;
+	struct completion irq_detection;
+	struct tpm_chip *chip;
+	u8 buf[TPM_BUFSIZE + 1];
+	int io_serirq;
+	int io_lpcpd;
+};
+
 /*
  * write8_reg
  * Send byte to the TIS register according to the ST33ZP24 I2C protocol.
@@ -106,17 +115,12 @@ enum tis_defaults {
  * @param: tpm_size, The length of the data
  * @return: Returns negative errno, or else the number of bytes written.
  */
-static int write8_reg(struct i2c_client *client, u8 tpm_register,
+static int write8_reg(struct tpm_stm_dev *tpm_dev, u8 tpm_register,
 		      u8 *tpm_data, u16 tpm_size)
 {
-	struct st33zp24_platform_data *pin_infos;
-
-	pin_infos = client->dev.platform_data;
-
-	pin_infos->tpm_i2c_buffer[0][0] = tpm_register;
-	memcpy(&pin_infos->tpm_i2c_buffer[0][1], tpm_data, tpm_size);
-	return i2c_master_send(client, pin_infos->tpm_i2c_buffer[0],
-				tpm_size + 1);
+	tpm_dev->buf[0] = tpm_register;
+	memcpy(tpm_dev->buf + 1, tpm_data, tpm_size);
+	return i2c_master_send(tpm_dev->client, tpm_dev->buf, tpm_size + 1);
 } /* write8_reg() */
 
 /*
@@ -127,56 +131,56 @@ static int write8_reg(struct i2c_client *client, u8 tpm_register,
  * @param: tpm_size, tpm TPM response size to read.
  * @return: number of byte read successfully: should be one if success.
  */
-static int read8_reg(struct i2c_client *client, u8 tpm_register,
+static int read8_reg(struct tpm_stm_dev *tpm_dev, u8 tpm_register,
 		    u8 *tpm_data, int tpm_size)
 {
 	u8 status = 0;
 	u8 data;
 
 	data = TPM_DUMMY_BYTE;
-	status = write8_reg(client, tpm_register, &data, 1);
+	status = write8_reg(tpm_dev, tpm_register, &data, 1);
 	if (status == 2)
-		status = i2c_master_recv(client, tpm_data, tpm_size);
+		status = i2c_master_recv(tpm_dev->client, tpm_data, tpm_size);
 	return status;
 } /* read8_reg() */
 
 /*
  * I2C_WRITE_DATA
  * Send byte to the TIS register according to the ST33ZP24 I2C protocol.
- * @param: client, the chip description
+ * @param: tpm_dev, the chip description
  * @param: tpm_register, the tpm tis register where the data should be written
  * @param: tpm_data, the tpm_data to write inside the tpm_register
  * @param: tpm_size, The length of the data
  * @return: number of byte written successfully: should be one if success.
  */
-#define I2C_WRITE_DATA(client, tpm_register, tpm_data, tpm_size) \
-	(write8_reg(client, tpm_register | \
+#define I2C_WRITE_DATA(tpm_dev, tpm_register, tpm_data, tpm_size) \
+	(write8_reg(tpm_dev, tpm_register | \
 	TPM_WRITE_DIRECTION, tpm_data, tpm_size))
 
 /*
  * I2C_READ_DATA
  * Recv byte from the TIS register according to the ST33ZP24 I2C protocol.
- * @param: tpm, the chip description
+ * @param: tpm_dev, the chip description
  * @param: tpm_register, the tpm tis register where the data should be read
  * @param: tpm_data, the TPM response
  * @param: tpm_size, tpm TPM response size to read.
  * @return: number of byte read successfully: should be one if success.
  */
-#define I2C_READ_DATA(client, tpm_register, tpm_data, tpm_size) \
-	(read8_reg(client, tpm_register, tpm_data, tpm_size))
+#define I2C_READ_DATA(tpm_dev, tpm_register, tpm_data, tpm_size) \
+	(read8_reg(tpm_dev, tpm_register, tpm_data, tpm_size))
 
 /*
  * clear_interruption
  * clear the TPM interrupt register.
  * @param: tpm, the chip description
  */
-static void clear_interruption(struct i2c_client *client)
+static void clear_interruption(struct tpm_stm_dev *tpm_dev)
 {
 	u8 interrupt;
 
-	I2C_READ_DATA(client, TPM_INT_STATUS, &interrupt, 1);
-	I2C_WRITE_DATA(client, TPM_INT_STATUS, &interrupt, 1);
-	I2C_READ_DATA(client, TPM_INT_STATUS, &interrupt, 1);
+	I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1);
+	I2C_WRITE_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1);
+	I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1);
 } /* clear_interruption() */
 
 /*
@@ -190,17 +194,16 @@ static long _wait_for_interrupt_serirq_timeout(struct tpm_chip *chip,
 {
 	long status;
 	struct i2c_client *client;
-	struct st33zp24_platform_data *pin_infos;
+	struct tpm_stm_dev *tpm_dev;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
-	pin_infos = client->dev.platform_data;
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
+	client = tpm_dev->client;
 
 	status = wait_for_completion_interruptible_timeout(
-					&pin_infos->irq_detection,
-						timeout);
+					&tpm_dev->irq_detection,
+					timeout);
 	if (status > 0)
-		enable_irq(gpio_to_irq(pin_infos->io_serirq));
-	gpio_direction_input(pin_infos->io_serirq);
+		enable_irq(client->irq);
 
 	return status;
 } /* wait_for_interrupt_serirq_timeout() */
@@ -209,15 +212,15 @@ static int wait_for_serirq_timeout(struct tpm_chip *chip, bool condition,
 				 unsigned long timeout)
 {
 	int status = 2;
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
 	status = _wait_for_interrupt_serirq_timeout(chip, timeout);
 	if (!status) {
 		status = -EBUSY;
 	} else {
-		clear_interruption(client);
+		clear_interruption(tpm_dev);
 		if (condition)
 			status = 1;
 	}
@@ -230,13 +233,13 @@ static int wait_for_serirq_timeout(struct tpm_chip *chip, bool condition,
  */
 static void tpm_stm_i2c_cancel(struct tpm_chip *chip)
 {
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 	u8 data;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
 	data = TPM_STS_COMMAND_READY;
-	I2C_WRITE_DATA(client, TPM_STS, &data, 1);
+	I2C_WRITE_DATA(tpm_dev, TPM_STS, &data, 1);
 	if (chip->vendor.irq)
 		wait_for_serirq_timeout(chip, 1, chip->vendor.timeout_a);
 }	/* tpm_stm_i2c_cancel() */
@@ -248,12 +251,12 @@ static void tpm_stm_i2c_cancel(struct tpm_chip *chip)
  */
 static u8 tpm_stm_i2c_status(struct tpm_chip *chip)
 {
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 	u8 data;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
-	I2C_READ_DATA(client, TPM_STS, &data, 1);
+	I2C_READ_DATA(tpm_dev, TPM_STS, &data, 1);
 	return data;
 }				/* tpm_stm_i2c_status() */
 
@@ -265,13 +268,13 @@ static u8 tpm_stm_i2c_status(struct tpm_chip *chip)
  */
 static int check_locality(struct tpm_chip *chip)
 {
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 	u8 data;
 	u8 status;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
-	status = I2C_READ_DATA(client, TPM_ACCESS, &data, 1);
+	status = I2C_READ_DATA(tpm_dev, TPM_ACCESS, &data, 1);
 	if (status && (data &
 		(TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) ==
 		(TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID))
@@ -290,16 +293,16 @@ static int request_locality(struct tpm_chip *chip)
 {
 	unsigned long stop;
 	long rc;
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 	u8 data;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
 	if (check_locality(chip) == chip->vendor.locality)
 		return chip->vendor.locality;
 
 	data = TPM_ACCESS_REQUEST_USE;
-	rc = I2C_WRITE_DATA(client, TPM_ACCESS, &data, 1);
+	rc = I2C_WRITE_DATA(tpm_dev, TPM_ACCESS, &data, 1);
 	if (rc < 0)
 		goto end;
 
@@ -328,13 +331,13 @@ end:
  */
 static void release_locality(struct tpm_chip *chip)
 {
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 	u8 data;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 	data = TPM_ACCESS_ACTIVE_LOCALITY;
 
-	I2C_WRITE_DATA(client, TPM_ACCESS, &data, 1);
+	I2C_WRITE_DATA(tpm_dev, TPM_ACCESS, &data, 1);
 }
 
 /*
@@ -347,19 +350,20 @@ static int get_burstcount(struct tpm_chip *chip)
 	unsigned long stop;
 	int burstcnt, status;
 	u8 tpm_reg, temp;
+	struct tpm_stm_dev *tpm_dev;
 
-	struct i2c_client *client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
 	stop = jiffies + chip->vendor.timeout_d;
 	do {
 		tpm_reg = TPM_STS + 1;
-		status = I2C_READ_DATA(client, tpm_reg, &temp, 1);
+		status = I2C_READ_DATA(tpm_dev, tpm_reg, &temp, 1);
 		if (status < 0)
 			goto end;
 
 		tpm_reg = tpm_reg + 1;
 		burstcnt = temp;
-		status = I2C_READ_DATA(client, tpm_reg, &temp, 1);
+		status = I2C_READ_DATA(tpm_dev, tpm_reg, &temp, 1);
 		if (status < 0)
 			goto end;
 
@@ -416,9 +420,9 @@ static int wait_for_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout,
 static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count)
 {
 	int size = 0, burstcnt, len;
-	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
 
 	while (size < count &&
 	       wait_for_stat(chip,
@@ -430,7 +434,7 @@ static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count)
 		if (burstcnt < 0)
 			return burstcnt;
 		len = min_t(int, burstcnt, count - size);
-		I2C_READ_DATA(client, TPM_DATA_FIFO, buf + size, len);
+		I2C_READ_DATA(tpm_dev, TPM_DATA_FIFO, buf + size, len);
 		size += len;
 	}
 	return size;
@@ -446,14 +450,14 @@ static irqreturn_t tpm_ioserirq_handler(int irq, void *dev_id)
 {
 	struct tpm_chip *chip = dev_id;
 	struct i2c_client *client;
-	struct st33zp24_platform_data *pin_infos;
+	struct tpm_stm_dev *tpm_dev;
 
 	disable_irq_nosync(irq);
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
-	pin_infos = client->dev.platform_data;
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
+	client = tpm_dev->client;
 
-	complete(&pin_infos->irq_detection);
+	complete(&tpm_dev->irq_detection);
 	return IRQ_HANDLED;
 } /* tpm_ioserirq_handler() */
 
@@ -475,13 +479,15 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf,
 	int ret;
 	u8 data;
 	struct i2c_client *client;
+	struct tpm_stm_dev *tpm_dev;
 
 	if (chip == NULL)
 		return -EBUSY;
 	if (len < TPM_HEADER_SIZE)
 		return -EBUSY;
 
-	client = (struct i2c_client *)TPM_VPRIV(chip);
+	tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip);
+	client = tpm_dev->client;
 
 	client->flags = 0;
 
@@ -505,7 +511,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf,
 		if (burstcnt < 0)
 			return burstcnt;
 		size = min_t(int, len - i - 1, burstcnt);
-		ret = I2C_WRITE_DATA(client, TPM_DATA_FIFO, buf + i, size);
+		ret = I2C_WRITE_DATA(tpm_dev, TPM_DATA_FIFO, buf + i, size);
 		if (ret < 0)
 			goto out_err;
 
@@ -518,7 +524,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf,
 		goto out_err;
 	}
 
-	ret = I2C_WRITE_DATA(client, TPM_DATA_FIFO, buf + len - 1, 1);
+	ret = I2C_WRITE_DATA(tpm_dev, TPM_DATA_FIFO, buf + len - 1, 1);
 	if (ret < 0)
 		goto out_err;
 
@@ -529,7 +535,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf,
 	}
 
 	data = TPM_STS_GO;
-	I2C_WRITE_DATA(client, TPM_STS, &data, 1);
+	I2C_WRITE_DATA(tpm_dev, TPM_STS, &data, 1);
 
 	return len;
 out_err:
@@ -623,6 +629,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	u8 intmask;
 	struct tpm_chip *chip;
 	struct st33zp24_platform_data *platform_data;
+	struct tpm_stm_dev *tpm_dev;
 
 	if (client == NULL) {
 		pr_info("%s: i2c client is NULL. Device not accessible.\n",
@@ -637,11 +644,11 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 		goto end;
 	}
 
-	chip = tpm_register_hardware(&client->dev, &st_i2c_tpm);
-	if (!chip) {
-		dev_info(&client->dev, "fail chip\n");
-		err = -ENODEV;
-		goto end;
+	tpm_dev = devm_kzalloc(&client->dev, sizeof(struct tpm_stm_dev),
+			       GFP_KERNEL);
+	if (!tpm_dev) {
+		err = -ENOMEM;
+		goto _tpm_clean_answer;
 	}
 
 	platform_data = client->dev.platform_data;
@@ -652,20 +659,14 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 		goto _tpm_clean_answer;
 	}
 
-	platform_data->tpm_i2c_buffer[0] =
-	    kmalloc(TPM_BUFSIZE, GFP_KERNEL);
-	if (platform_data->tpm_i2c_buffer[0] == NULL) {
-		err = -ENOMEM;
-		goto _tpm_clean_answer;
-	}
-	platform_data->tpm_i2c_buffer[1] =
-	    kmalloc(TPM_BUFSIZE, GFP_KERNEL);
-	if (platform_data->tpm_i2c_buffer[1] == NULL) {
-		err = -ENOMEM;
-		goto _tpm_clean_response1;
+	chip = tpm_register_hardware(&client->dev, &st_i2c_tpm);
+	if (!chip) {
+		dev_info(&client->dev, "fail chip\n");
+		return -ENODEV;
 	}
 
-	TPM_VPRIV(chip) = client;
+	TPM_VPRIV(chip) = tpm_dev;
+	tpm_dev->client = client;
 
 	chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
 	chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT);
@@ -682,16 +683,16 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	}
 
 	if (interrupts) {
-		init_completion(&platform_data->irq_detection);
+		init_completion(&tpm_dev->irq_detection);
 		if (request_locality(chip) != LOCALITY0) {
 			err = -ENODEV;
-			goto _tpm_clean_response2;
+			goto _tpm_clean_answer;
 		}
 		err = gpio_request(platform_data->io_serirq, "TPM IO_SERIRQ");
 		if (err)
 			goto _gpio_init2;
 
-		clear_interruption(client);
+		clear_interruption(tpm_dev);
 		err = request_irq(gpio_to_irq(platform_data->io_serirq),
 				&tpm_ioserirq_handler,
 				IRQF_TRIGGER_HIGH,
@@ -702,7 +703,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 			goto _irq_set;
 		}
 
-		err = I2C_READ_DATA(client, TPM_INT_ENABLE, &intmask, 1);
+		err = I2C_READ_DATA(tpm_dev, TPM_INT_ENABLE, &intmask, 1);
 		if (err < 0)
 			goto _irq_set;
 
@@ -713,16 +714,17 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 			|  TPM_INTF_STS_VALID_INT
 			|  TPM_INTF_DATA_AVAIL_INT;
 
-		err = I2C_WRITE_DATA(client, TPM_INT_ENABLE, &intmask, 1);
+		err = I2C_WRITE_DATA(tpm_dev, TPM_INT_ENABLE, &intmask, 1);
 		if (err < 0)
 			goto _irq_set;
 
 		intmask = TPM_GLOBAL_INT_ENABLE;
-		err = I2C_WRITE_DATA(client, (TPM_INT_ENABLE + 3), &intmask, 1);
+		err = I2C_WRITE_DATA(tpm_dev, (TPM_INT_ENABLE + 3),
+				     &intmask, 1);
 		if (err < 0)
 			goto _irq_set;
 
-		err = I2C_READ_DATA(client, TPM_INT_STATUS, &intmask, 1);
+		err = I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &intmask, 1);
 		if (err < 0)
 			goto _irq_set;
 
@@ -744,12 +746,6 @@ _gpio_init2:
 _gpio_init1:
 	if (power_mgt)
 		gpio_free(platform_data->io_lpcpd);
-_tpm_clean_response2:
-	kzfree(platform_data->tpm_i2c_buffer[1]);
-	platform_data->tpm_i2c_buffer[1] = NULL;
-_tpm_clean_response1:
-	kzfree(platform_data->tpm_i2c_buffer[0]);
-	platform_data->tpm_i2c_buffer[0] = NULL;
 _tpm_clean_answer:
 	tpm_remove_hardware(chip->dev);
 end:
@@ -765,28 +761,12 @@ end:
  */
 static int tpm_st33_i2c_remove(struct i2c_client *client)
 {
-	struct tpm_chip *chip = (struct tpm_chip *)i2c_get_clientdata(client);
-	struct st33zp24_platform_data *pin_infos =
-		((struct i2c_client *)TPM_VPRIV(chip))->dev.platform_data;
-
-	if (pin_infos != NULL) {
-		free_irq(pin_infos->io_serirq, chip);
-
-		gpio_free(pin_infos->io_serirq);
-		gpio_free(pin_infos->io_lpcpd);
+	struct tpm_chip *chip =
+		(struct tpm_chip *) i2c_get_clientdata(client);
 
+	if (chip)
 		tpm_remove_hardware(chip->dev);
 
-		if (pin_infos->tpm_i2c_buffer[1] != NULL) {
-			kzfree(pin_infos->tpm_i2c_buffer[1]);
-			pin_infos->tpm_i2c_buffer[1] = NULL;
-		}
-		if (pin_infos->tpm_i2c_buffer[0] != NULL) {
-			kzfree(pin_infos->tpm_i2c_buffer[0]);
-			pin_infos->tpm_i2c_buffer[0] = NULL;
-		}
-	}
-
 	return 0;
 }
 
diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.h b/drivers/char/tpm/tpm_i2c_stm_st33.h
deleted file mode 100644
index 3041271342eb..000000000000
--- a/drivers/char/tpm/tpm_i2c_stm_st33.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24
- * Copyright (C) 2009, 2010  STMicroelectronics
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * STMicroelectronics version 1.2.0, Copyright (C) 2010
- * STMicroelectronics comes with ABSOLUTELY NO WARRANTY.
- * This is free software, and you are welcome to redistribute it
- * under certain conditions.
- *
- * @Author: Christophe RICARD tpmsupport@st.com
- *
- * @File: stm_st33_tpm_i2c.h
- *
- * @Date: 09/15/2010
- */
-#ifndef __STM_ST33_TPM_I2C_MAIN_H__
-#define __STM_ST33_TPM_I2C_MAIN_H__
-
-#define TPM_ST33_I2C			"st33zp24_i2c"
-
-struct st33zp24_platform_data {
-	int io_serirq;
-	int io_lpcpd;
-	struct i2c_client *client;
-	u8 *tpm_i2c_buffer[2]; /* 0 Request 1 Response */
-	struct completion irq_detection;
-	struct mutex lock;
-};
-
-#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */
diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h
new file mode 100644
index 000000000000..88f9cb18113a
--- /dev/null
+++ b/include/linux/platform_data/tpm_i2c_stm_st33.h
@@ -0,0 +1,40 @@
+/*
+ * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24
+ * Copyright (C) 2009, 2010  STMicroelectronics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * STMicroelectronics version 1.2.0, Copyright (C) 2010
+ * STMicroelectronics comes with ABSOLUTELY NO WARRANTY.
+ * This is free software, and you are welcome to redistribute it
+ * under certain conditions.
+ *
+ * @Author: Christophe RICARD tpmsupport@st.com
+ *
+ * @File: stm_st33_tpm_i2c.h
+ *
+ * @Date: 09/15/2010
+ */
+#ifndef __STM_ST33_TPM_I2C_MAIN_H__
+#define __STM_ST33_TPM_I2C_MAIN_H__
+
+
+#define TPM_ST33_I2C			"st33zp24_i2c"
+
+struct st33zp24_platform_data {
+	int io_serirq;
+	int io_lpcpd;
+};
+
+#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */
-- 
cgit v1.2.3


From 76182b6b008b694d095aec1e2eb6c07fae787128 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Mon, 1 Dec 2014 19:32:52 +0100
Subject: tpm/tpm_i2c_stm_st33: Remove reference to io_serirq

The serirq gpio pin is used only as interrupt. After driver initialization,
the serirq signal is always used through interrupt and never with gpio
kernel API.

The irq can then be initialized during the platform_data definition within the client->irq pin.

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Peter Huewe <peterhuewe@gmx.de>
---
 drivers/char/tpm/tpm_i2c_stm_st33.c            | 13 +++----------
 include/linux/platform_data/tpm_i2c_stm_st33.h |  1 -
 2 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c
index 4f725386385f..728611638d1c 100644
--- a/drivers/char/tpm/tpm_i2c_stm_st33.c
+++ b/drivers/char/tpm/tpm_i2c_stm_st33.c
@@ -103,7 +103,6 @@ struct tpm_stm_dev {
 	struct completion irq_detection;
 	struct tpm_chip *chip;
 	u8 buf[TPM_BUFSIZE + 1];
-	int io_serirq;
 	int io_lpcpd;
 };
 
@@ -688,18 +687,15 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 			err = -ENODEV;
 			goto _tpm_clean_answer;
 		}
-		err = gpio_request(platform_data->io_serirq, "TPM IO_SERIRQ");
-		if (err)
-			goto _gpio_init2;
 
 		clear_interruption(tpm_dev);
-		err = request_irq(gpio_to_irq(platform_data->io_serirq),
+		err = request_irq(client->irq,
 				&tpm_ioserirq_handler,
 				IRQF_TRIGGER_HIGH,
 				"TPM SERIRQ management", chip);
 		if (err < 0) {
 			dev_err(chip->dev , "TPM SERIRQ signals %d not available\n",
-				gpio_to_irq(platform_data->io_serirq));
+				client->irq);
 			goto _irq_set;
 		}
 
@@ -739,10 +735,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	dev_info(chip->dev, "TPM I2C Initialized\n");
 	return 0;
 _irq_set:
-	free_irq(gpio_to_irq(platform_data->io_serirq), (void *)chip);
-_gpio_init2:
-	if (interrupts)
-		gpio_free(platform_data->io_serirq);
+	free_irq(client->irq, (void *)chip);
 _gpio_init1:
 	if (power_mgt)
 		gpio_free(platform_data->io_lpcpd);
diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h
index 88f9cb18113a..85775cf5f9a5 100644
--- a/include/linux/platform_data/tpm_i2c_stm_st33.h
+++ b/include/linux/platform_data/tpm_i2c_stm_st33.h
@@ -33,7 +33,6 @@
 #define TPM_ST33_I2C			"st33zp24_i2c"
 
 struct st33zp24_platform_data {
-	int io_serirq;
 	int io_lpcpd;
 };
 
-- 
cgit v1.2.3


From 3eda7d0ea3a0365aa72a2007f9450f314d92f065 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Tue, 13 Jan 2015 23:13:13 +0100
Subject: tpm/tpm_i2c_stm_st33: Change tpm_i2c_stm_st33.h to tpm_stm_st33.h

include/linux/platform_data/tpm_i2c_stm_st33.h can be used by other st33
tpm device driver not using i2c protocol.

Reviewed-by: Jason Gunthorpe <jason.gunthorpe@obsidianresearch.com>
Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Reviewed-by: Peter Huewe <peterhuewe@gmx.de>
Signed-off-by: Peter Huewe <peterhuewe@gmx.de>
---
 drivers/char/tpm/tpm_i2c_stm_st33.c            |  2 +-
 include/linux/platform_data/tpm_i2c_stm_st33.h | 39 --------------------------
 include/linux/platform_data/tpm_stm_st33.h     | 39 ++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 40 deletions(-)
 delete mode 100644 include/linux/platform_data/tpm_i2c_stm_st33.h
 create mode 100644 include/linux/platform_data/tpm_stm_st33.h

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c
index 86a24ced66c0..dbab8d0d875e 100644
--- a/drivers/char/tpm/tpm_i2c_stm_st33.c
+++ b/drivers/char/tpm/tpm_i2c_stm_st33.c
@@ -50,7 +50,7 @@
 #include <linux/of_irq.h>
 #include <linux/of_gpio.h>
 
-#include <linux/platform_data/tpm_i2c_stm_st33.h>
+#include <linux/platform_data/tpm_stm_st33.h>
 #include "tpm.h"
 
 #define TPM_ACCESS			0x0
diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h
deleted file mode 100644
index 85775cf5f9a5..000000000000
--- a/include/linux/platform_data/tpm_i2c_stm_st33.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24
- * Copyright (C) 2009, 2010  STMicroelectronics
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * STMicroelectronics version 1.2.0, Copyright (C) 2010
- * STMicroelectronics comes with ABSOLUTELY NO WARRANTY.
- * This is free software, and you are welcome to redistribute it
- * under certain conditions.
- *
- * @Author: Christophe RICARD tpmsupport@st.com
- *
- * @File: stm_st33_tpm_i2c.h
- *
- * @Date: 09/15/2010
- */
-#ifndef __STM_ST33_TPM_I2C_MAIN_H__
-#define __STM_ST33_TPM_I2C_MAIN_H__
-
-
-#define TPM_ST33_I2C			"st33zp24_i2c"
-
-struct st33zp24_platform_data {
-	int io_lpcpd;
-};
-
-#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */
diff --git a/include/linux/platform_data/tpm_stm_st33.h b/include/linux/platform_data/tpm_stm_st33.h
new file mode 100644
index 000000000000..ff75310c0f47
--- /dev/null
+++ b/include/linux/platform_data/tpm_stm_st33.h
@@ -0,0 +1,39 @@
+/*
+ * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24
+ * Copyright (C) 2009, 2010  STMicroelectronics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * STMicroelectronics version 1.2.0, Copyright (C) 2010
+ * STMicroelectronics comes with ABSOLUTELY NO WARRANTY.
+ * This is free software, and you are welcome to redistribute it
+ * under certain conditions.
+ *
+ * @Author: Christophe RICARD tpmsupport@st.com
+ *
+ * @File: stm_st33_tpm.h
+ *
+ * @Date: 09/15/2010
+ */
+#ifndef __STM_ST33_TPM_H__
+#define __STM_ST33_TPM_H__
+
+#define TPM_ST33_I2C			"st33zp24-i2c"
+#define TPM_ST33_SPI			"st33zp24-spi"
+
+struct st33zp24_platform_data {
+	int io_lpcpd;
+};
+
+#endif /* __STM_ST33_TPM_H__ */
-- 
cgit v1.2.3


From 927609d622a3773995f84bc03b4564f873cf0e22 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 25 Nov 2014 10:16:39 +0100
Subject: kernel: tighten rules for ACCESS ONCE

Now that all non-scalar users of ACCESS_ONCE have been converted
to READ_ONCE or ASSIGN once, lets tighten ACCESS_ONCE to only
work on scalar types.
This variant was proposed by Alexei Starovoitov.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/compiler.h | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index a1c81f80978e..5e186bf6f6c1 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -447,12 +447,23 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int
  * to make the compiler aware of ordering is to put the two invocations of
  * ACCESS_ONCE() in different C statements.
  *
- * This macro does absolutely -nothing- to prevent the CPU from reordering,
- * merging, or refetching absolutely anything at any time.  Its main intended
- * use is to mediate communication between process-level code and irq/NMI
- * handlers, all running on the same CPU.
+ * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE
+ * on a union member will work as long as the size of the member matches the
+ * size of the union and the size is smaller than word size.
+ *
+ * The major use cases of ACCESS_ONCE used to be (1) Mediating communication
+ * between process-level code and irq/NMI handlers, all running on the same CPU,
+ * and (2) Ensuring that the compiler does not  fold, spindle, or otherwise
+ * mutilate accesses that either do not require ordering or that interact
+ * with an explicit memory barrier or atomic instruction that provides the
+ * required ordering.
+ *
+ * If possible use READ_ONCE/ASSIGN_ONCE instead.
  */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+#define __ACCESS_ONCE(x) ({ \
+	 __maybe_unused typeof(x) __var = 0; \
+	(volatile typeof(x) *)&(x); })
+#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
 
 /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
 #ifdef CONFIG_KPROBES
-- 
cgit v1.2.3


From c5b19946eb76c67566aae6a84bf2b10ad59295ea Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Mon, 12 Jan 2015 12:13:39 +0100
Subject: kernel: Fix sparse warning for ACCESS_ONCE

Commit 927609d622a3 ("kernel: tighten rules for ACCESS ONCE") results in
sparse warnings like "Using plain integer as NULL pointer" - Let's add a
type cast to the dummy assignment.
To avoid warnings lik "sparse: warning: cast to restricted __hc32" we also
use __force on that cast.

Fixes: 927609d622a3 ("kernel: tighten rules for ACCESS ONCE")
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5e186bf6f6c1..7bebf0563d18 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -461,7 +461,7 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int
  * If possible use READ_ONCE/ASSIGN_ONCE instead.
  */
 #define __ACCESS_ONCE(x) ({ \
-	 __maybe_unused typeof(x) __var = 0; \
+	 __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
 	(volatile typeof(x) *)&(x); })
 #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
 
-- 
cgit v1.2.3


From 85b4545629663486b7f71047ce3b54fa0ad3eb28 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 12 Jan 2015 17:51:14 +0000
Subject: iommu: Consolidate IOVA allocator code

In order to share the IOVA allocator with other architectures, break
the unnecssary dependency on the Intel IOMMU driver and move the
remaining IOVA internals to iova.c

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c | 33 ++-------------------------------
 drivers/iommu/iova.c        | 35 +++++++++++++++++++++++++++++++++++
 include/linux/iova.h        |  3 +++
 3 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 40dfbc0444c0..e758d8ed8fb5 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -485,7 +485,6 @@ __setup("intel_iommu=", intel_iommu_setup);
 
 static struct kmem_cache *iommu_domain_cache;
 static struct kmem_cache *iommu_devinfo_cache;
-static struct kmem_cache *iommu_iova_cache;
 
 static inline void *alloc_pgtable_page(int node)
 {
@@ -523,16 +522,6 @@ static inline void free_devinfo_mem(void *vaddr)
 	kmem_cache_free(iommu_devinfo_cache, vaddr);
 }
 
-struct iova *alloc_iova_mem(void)
-{
-	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
-}
-
-void free_iova_mem(struct iova *iova)
-{
-	kmem_cache_free(iommu_iova_cache, iova);
-}
-
 static inline int domain_type_is_vm(struct dmar_domain *domain)
 {
 	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
@@ -3427,23 +3416,6 @@ static inline int iommu_devinfo_cache_init(void)
 	return ret;
 }
 
-static inline int iommu_iova_cache_init(void)
-{
-	int ret = 0;
-
-	iommu_iova_cache = kmem_cache_create("iommu_iova",
-					 sizeof(struct iova),
-					 0,
-					 SLAB_HWCACHE_ALIGN,
-					 NULL);
-	if (!iommu_iova_cache) {
-		printk(KERN_ERR "Couldn't create iova cache\n");
-		ret = -ENOMEM;
-	}
-
-	return ret;
-}
-
 static int __init iommu_init_mempool(void)
 {
 	int ret;
@@ -3461,7 +3433,7 @@ static int __init iommu_init_mempool(void)
 
 	kmem_cache_destroy(iommu_domain_cache);
 domain_error:
-	kmem_cache_destroy(iommu_iova_cache);
+	iommu_iova_cache_destroy();
 
 	return -ENOMEM;
 }
@@ -3470,8 +3442,7 @@ static void __init iommu_exit_mempool(void)
 {
 	kmem_cache_destroy(iommu_devinfo_cache);
 	kmem_cache_destroy(iommu_domain_cache);
-	kmem_cache_destroy(iommu_iova_cache);
-
+	iommu_iova_cache_destroy();
 }
 
 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index f6b17e6af2fb..520b8c8ae0c4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -18,6 +18,41 @@
  */
 
 #include <linux/iova.h>
+#include <linux/slab.h>
+
+static struct kmem_cache *iommu_iova_cache;
+
+int iommu_iova_cache_init(void)
+{
+	int ret = 0;
+
+	iommu_iova_cache = kmem_cache_create("iommu_iova",
+					 sizeof(struct iova),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!iommu_iova_cache) {
+		pr_err("Couldn't create iova cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+void iommu_iova_cache_destroy(void)
+{
+	kmem_cache_destroy(iommu_iova_cache);
+}
+
+struct iova *alloc_iova_mem(void)
+{
+	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
+}
+
+void free_iova_mem(struct iova *iova)
+{
+	kmem_cache_free(iommu_iova_cache, iova);
+}
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 19e81d5ccb6d..ad0507c61cc7 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -39,6 +39,9 @@ static inline unsigned long iova_size(struct iova *iova)
 	return iova->pfn_hi - iova->pfn_lo + 1;
 }
 
+int iommu_iova_cache_init(void);
+void iommu_iova_cache_destroy(void);
+
 struct iova *alloc_iova_mem(void);
 void free_iova_mem(struct iova *iova);
 void free_iova(struct iova_domain *iovad, unsigned long pfn);
-- 
cgit v1.2.3


From 1b72250076dde4276acecf3a7da722b185703e78 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 12 Jan 2015 17:51:15 +0000
Subject: iommu: Make IOVA domain low limit flexible

To share the IOVA allocator with other architectures, it needs to
accommodate more general aperture restrictions; move the lower limit
from a compile-time constant to a runtime domain property to allow
IOVA domains with different requirements to co-exist.

Also reword the slightly unclear description of alloc_iova since we're
touching it anyway.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c |  9 ++++++---
 drivers/iommu/iova.c        | 10 ++++++----
 include/linux/iova.h        |  7 +++----
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e758d8ed8fb5..86f9e82b015b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -71,6 +71,9 @@
 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
 
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN		(1)
+
 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
 #define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
 #define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
@@ -1632,7 +1635,7 @@ static int dmar_init_reserved_ranges(void)
 	struct iova *iova;
 	int i;
 
-	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
+	init_iova_domain(&reserved_iova_list, IOVA_START_PFN, DMA_32BIT_PFN);
 
 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
 		&reserved_rbtree_key);
@@ -1690,7 +1693,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	int adjust_width, agaw;
 	unsigned long sagaw;
 
-	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+	init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN);
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
@@ -4313,7 +4316,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 {
 	int adjust_width;
 
-	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+	init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN);
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 520b8c8ae0c4..a3dbba8caa19 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -55,11 +55,13 @@ void free_iova_mem(struct iova *iova)
 }
 
 void
-init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn,
+	unsigned long pfn_32bit)
 {
 	spin_lock_init(&iovad->iova_rbtree_lock);
 	iovad->rbroot = RB_ROOT;
 	iovad->cached32_node = NULL;
+	iovad->start_pfn = start_pfn;
 	iovad->dma_32bit_pfn = pfn_32bit;
 }
 
@@ -162,7 +164,7 @@ move_left:
 	if (!curr) {
 		if (size_aligned)
 			pad_size = iova_get_pad_size(size, limit_pfn);
-		if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
+		if ((iovad->start_pfn + size + pad_size) > limit_pfn) {
 			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 			return -ENOMEM;
 		}
@@ -237,8 +239,8 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova)
  * @size: - size of page frames to allocate
  * @limit_pfn: - max limit address
  * @size_aligned: - set if size_aligned address range is required
- * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
- * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
+ * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
+ * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
  * flag is set then the allocated address iova->pfn_lo will be naturally
  * aligned on roundup_power_of_two(size).
  */
diff --git a/include/linux/iova.h b/include/linux/iova.h
index ad0507c61cc7..591b19626b46 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -16,9 +16,6 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h>
 
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN		(1)
-
 /* iova structure */
 struct iova {
 	struct rb_node	node;
@@ -31,6 +28,7 @@ struct iova_domain {
 	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
 	struct rb_root	rbroot;		/* iova domain rbtree root */
 	struct rb_node	*cached32_node; /* Save last alloced node */
+	unsigned long	start_pfn;	/* Lower limit for this domain */
 	unsigned long	dma_32bit_pfn;
 };
 
@@ -52,7 +50,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+void init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn,
+	unsigned long pfn_32bit);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
-- 
cgit v1.2.3


From 0fb5fe874c42942e16c450ae05da453e13a1c09e Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 12 Jan 2015 17:51:16 +0000
Subject: iommu: Make IOVA domain page size explicit

Systems may contain heterogeneous IOMMUs supporting differing minimum
page sizes, which may also not be common with the CPU page size.
Thus it is practical to have an explicit notion of IOVA granularity
to simplify handling of mapping and allocation constraints.

As an initial step, move the IOVA page granularity from an implicit
compile-time constant to a per-domain property so we can make use
of it in IOVA domain context at runtime. To keep the abstraction tidy,
extend the little API of inline iova_* helpers to parallel some of the
equivalent PAGE_* macros.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c |  9 ++++++---
 drivers/iommu/iova.c        | 12 ++++++++++--
 include/linux/iova.h        | 35 +++++++++++++++++++++++++++++++++--
 3 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 86f9e82b015b..ae4c1a854e57 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1635,7 +1635,8 @@ static int dmar_init_reserved_ranges(void)
 	struct iova *iova;
 	int i;
 
-	init_iova_domain(&reserved_iova_list, IOVA_START_PFN, DMA_32BIT_PFN);
+	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
+			DMA_32BIT_PFN);
 
 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
 		&reserved_rbtree_key);
@@ -1693,7 +1694,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	int adjust_width, agaw;
 	unsigned long sagaw;
 
-	init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN);
+	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
+			DMA_32BIT_PFN);
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
@@ -4316,7 +4318,8 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 {
 	int adjust_width;
 
-	init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN);
+	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
+			DMA_32BIT_PFN);
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index a3dbba8caa19..9dd8208312c2 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -55,12 +55,20 @@ void free_iova_mem(struct iova *iova)
 }
 
 void
-init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn,
-	unsigned long pfn_32bit)
+init_iova_domain(struct iova_domain *iovad, unsigned long granule,
+	unsigned long start_pfn, unsigned long pfn_32bit)
 {
+	/*
+	 * IOVA granularity will normally be equal to the smallest
+	 * supported IOMMU page size; both *must* be capable of
+	 * representing individual CPU pages exactly.
+	 */
+	BUG_ON((granule > PAGE_SIZE) || !is_power_of_2(granule));
+
 	spin_lock_init(&iovad->iova_rbtree_lock);
 	iovad->rbroot = RB_ROOT;
 	iovad->cached32_node = NULL;
+	iovad->granule = granule;
 	iovad->start_pfn = start_pfn;
 	iovad->dma_32bit_pfn = pfn_32bit;
 }
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 591b19626b46..3920a19d8194 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -28,6 +28,7 @@ struct iova_domain {
 	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
 	struct rb_root	rbroot;		/* iova domain rbtree root */
 	struct rb_node	*cached32_node; /* Save last alloced node */
+	unsigned long	granule;	/* pfn granularity for this domain */
 	unsigned long	start_pfn;	/* Lower limit for this domain */
 	unsigned long	dma_32bit_pfn;
 };
@@ -37,6 +38,36 @@ static inline unsigned long iova_size(struct iova *iova)
 	return iova->pfn_hi - iova->pfn_lo + 1;
 }
 
+static inline unsigned long iova_shift(struct iova_domain *iovad)
+{
+	return __ffs(iovad->granule);
+}
+
+static inline unsigned long iova_mask(struct iova_domain *iovad)
+{
+	return iovad->granule - 1;
+}
+
+static inline size_t iova_offset(struct iova_domain *iovad, dma_addr_t iova)
+{
+	return iova & iova_mask(iovad);
+}
+
+static inline size_t iova_align(struct iova_domain *iovad, size_t size)
+{
+	return ALIGN(size, iovad->granule);
+}
+
+static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova)
+{
+	return (dma_addr_t)iova->pfn_lo << iova_shift(iovad);
+}
+
+static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
+{
+	return iova >> iova_shift(iovad);
+}
+
 int iommu_iova_cache_init(void);
 void iommu_iova_cache_destroy(void);
 
@@ -50,8 +81,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn,
-	unsigned long pfn_32bit);
+void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
+	unsigned long start_pfn, unsigned long pfn_32bit);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
-- 
cgit v1.2.3


From 54c523127bcca986c6f9b04c7b56a949ea011899 Mon Sep 17 00:00:00 2001
From: Matt Wagantall <mattw@codeaurora.org>
Date: Mon, 15 Dec 2014 23:47:23 +0000
Subject: iopoll: Introduce memory-mapped IO polling macros

It is sometimes necessary to poll a memory-mapped register until its value
satisfies some condition. Introduce a family of convenience macros that do
this. Tight-looping, sleeping, and timing out can all be accomplished using
these macros.

Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Robert Elliott <elliott@hp.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Matt Wagantall <mattw@codeaurora.org>
Signed-off-by: Mitchel Humpherys <mitchelh@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/iopoll.h | 144 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 include/linux/iopoll.h

(limited to 'include/linux')

diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h
new file mode 100644
index 000000000000..1c30014ed176
--- /dev/null
+++ b/include/linux/iopoll.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2012-2014 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_IOPOLL_H
+#define _LINUX_IOPOLL_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/hrtimer.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+
+/**
+ * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs
+ * @op: accessor function (takes @addr as its only argument)
+ * @addr: Address to poll
+ * @val: Variable to read the value into
+ * @cond: Break condition (usually involving @val)
+ * @sleep_us: Maximum time to sleep between reads in us (0
+ *            tight-loops).  Should be less than ~20ms since usleep_range
+ *            is used (see Documentation/timers/timers-howto.txt).
+ * @timeout_us: Timeout in us, 0 means never timeout
+ *
+ * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
+ * case, the last read value at @addr is stored in @val. Must not
+ * be called from atomic context if sleep_us or timeout_us are used.
+ *
+ * When available, you'll probably want to use one of the specialized
+ * macros defined below rather than this macro directly.
+ */
+#define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us)	\
+({ \
+	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \
+	might_sleep_if(sleep_us); \
+	for (;;) { \
+		(val) = op(addr); \
+		if (cond) \
+			break; \
+		if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \
+			(val) = op(addr); \
+			break; \
+		} \
+		if (sleep_us) \
+			usleep_range((sleep_us >> 2) + 1, sleep_us); \
+	} \
+	(cond) ? 0 : -ETIMEDOUT; \
+})
+
+/**
+ * readx_poll_timeout_atomic - Periodically poll an address until a condition is met or a timeout occurs
+ * @op: accessor function (takes @addr as its only argument)
+ * @addr: Address to poll
+ * @val: Variable to read the value into
+ * @cond: Break condition (usually involving @val)
+ * @delay_us: Time to udelay between reads in us (0 tight-loops).  Should
+ *            be less than ~10us since udelay is used (see
+ *            Documentation/timers/timers-howto.txt).
+ * @timeout_us: Timeout in us, 0 means never timeout
+ *
+ * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
+ * case, the last read value at @addr is stored in @val.
+ *
+ * When available, you'll probably want to use one of the specialized
+ * macros defined below rather than this macro directly.
+ */
+#define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \
+({ \
+	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \
+	for (;;) { \
+		(val) = op(addr); \
+		if (cond) \
+			break; \
+		if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \
+			(val) = op(addr); \
+			break; \
+		} \
+		if (delay_us) \
+			udelay(delay_us);	\
+	} \
+	(cond) ? 0 : -ETIMEDOUT; \
+})
+
+
+#define readb_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readb, addr, val, cond, delay_us, timeout_us)
+
+#define readb_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readb, addr, val, cond, delay_us, timeout_us)
+
+#define readw_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readw, addr, val, cond, delay_us, timeout_us)
+
+#define readw_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readw, addr, val, cond, delay_us, timeout_us)
+
+#define readl_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readl, addr, val, cond, delay_us, timeout_us)
+
+#define readl_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readl, addr, val, cond, delay_us, timeout_us)
+
+#define readq_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readq, addr, val, cond, delay_us, timeout_us)
+
+#define readq_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readq, addr, val, cond, delay_us, timeout_us)
+
+#define readb_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readb_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readb_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readb_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readw_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readw_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readw_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readw_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readl_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readl_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readl_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readl_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readq_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout(readq_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#define readq_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \
+	readx_poll_timeout_atomic(readq_relaxed, addr, val, cond, delay_us, timeout_us)
+
+#endif /* _LINUX_IOPOLL_H */
-- 
cgit v1.2.3


From 379dcfb406002d855fa56f9c3d290c4048f44e9c Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 19 Jan 2015 00:13:39 +0100
Subject: crypto: doc - remove colons in comments

As documented in Documentation/kernel-doc-nano-HOWTO.txt lines
terminated with a colon are treated as headings.

The current layout of the documentation when compiling the kernel
crypto API DocBook documentation is messed up by by treating some lines
as headings. The patch removes colons from comments that shall not be
treated as headings.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 90998348e564..fb5ef16d6a12 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -1147,7 +1147,7 @@ static inline void ablkcipher_request_free(struct ablkcipher_request *req)
  * cipher operation completes.
  *
  * The callback function is registered with the ablkcipher_request handle and
- * must comply with the following template:
+ * must comply with the following template
  *
  *	void callback_function(struct crypto_async_request *req, int error)
  */
@@ -1174,7 +1174,7 @@ static inline void ablkcipher_request_set_callback(
  *
  * For encryption, the source is treated as the plaintext and the
  * destination is the ciphertext. For a decryption operation, the use is
- * reversed: the source is the ciphertext and the destination is the plaintext.
+ * reversed - the source is the ciphertext and the destination is the plaintext.
  */
 static inline void ablkcipher_request_set_crypt(
 	struct ablkcipher_request *req,
@@ -1509,7 +1509,7 @@ static inline void aead_request_free(struct aead_request *req)
  * completes
  *
  * The callback function is registered with the aead_request handle and
- * must comply with the following template:
+ * must comply with the following template
  *
  *	void callback_function(struct crypto_async_request *req, int error)
  */
@@ -1536,7 +1536,7 @@ static inline void aead_request_set_callback(struct aead_request *req,
  *
  * For encryption, the source is treated as the plaintext and the
  * destination is the ciphertext. For a decryption operation, the use is
- * reversed: the source is the ciphertext and the destination is the plaintext.
+ * reversed - the source is the ciphertext and the destination is the plaintext.
  *
  * IMPORTANT NOTE AEAD requires an authentication tag (MAC). For decryption,
  *		  the caller must concatenate the ciphertext followed by the
-- 
cgit v1.2.3


From 2fded7f44b8fcf79e274c3f0cfbd0298f95308f3 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 23 Dec 2014 16:39:54 -0500
Subject: audit: remove vestiges of vers_ops

Should have been removed with commit 18900909 ("audit: remove the old
depricated kernel interface").

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/linux/audit.h | 1 -
 kernel/auditfilter.c  | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 93331929d643..b481779a8dee 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -46,7 +46,6 @@ struct audit_tree;
 struct sk_buff;
 
 struct audit_krule {
-	int			vers_ops;
 	u32			pflags;
 	u32			flags;
 	u32			listnr;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 103586e239a2..81c94d739e3f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		goto exit_nofree;
 
 	bufp = data->buf;
-	entry->rule.vers_ops = 2;
 	for (i = 0; i < data->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
 
@@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = &entry->rule;
-	new->vers_ops = old->vers_ops;
 	new->flags = old->flags;
 	new->pflags = old->pflags;
 	new->listnr = old->listnr;
-- 
cgit v1.2.3


From ea2f83a7de9d0abbd145e37177905aab57fdb835 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Sun, 26 Oct 2014 23:17:00 +0000
Subject: arm/arm64: KVM: move kvm_register_device_ops() into vGIC probing

Currently we unconditionally register the GICv2 emulation device
during the host's KVM initialization. Since with GICv3 support we
may end up with only v2 or only v3 or both supported, we move the
registration into the GIC probing function, where we will later know
which combination is valid.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/arm/vgic-v2.c   | 2 ++
 virt/kvm/arm/vgic-v3.c   | 1 +
 virt/kvm/arm/vgic.c      | 5 ++---
 4 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b934cc94cc8..25d7ce31a5d4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1051,6 +1051,7 @@ void kvm_unregister_device_ops(u32 type);
 
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
+extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index 2935405ad22f..e1cd3cb95903 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -229,6 +229,8 @@ int vgic_v2_probe(struct device_node *vgic_node,
 		goto out_unmap;
 	}
 
+	kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
+
 	vgic->vcpu_base = vcpu_res.start;
 
 	kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index 1c2c8eef0599..d14c75f4a33b 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -230,6 +230,7 @@ int vgic_v3_probe(struct device_node *vgic_node,
 		ret = -ENXIO;
 		goto out;
 	}
+	kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
 
 	vgic->vcpu_base = vcpu_res.start;
 	vgic->vctrl_base = NULL;
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 9b63141a599d..69f6e7aa573e 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -2564,7 +2564,7 @@ static int vgic_create(struct kvm_device *dev, u32 type)
 	return kvm_vgic_create(dev->kvm, type);
 }
 
-static struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
 	.name = "kvm-arm-vgic",
 	.create = vgic_create,
 	.destroy = vgic_destroy,
@@ -2643,8 +2643,7 @@ int kvm_vgic_hyp_init(void)
 
 	on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
 
-	return kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-				       KVM_DEV_TYPE_ARM_VGIC_V2);
+	return 0;
 
 out_free_irq:
 	free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
-- 
cgit v1.2.3


From a0675c25d6392c2197b796a60c4a2a0138c86355 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Sat, 7 Jun 2014 00:54:51 +0200
Subject: arm/arm64: KVM: add virtual GICv3 distributor emulation

With everything separated and prepared, we implement a model of a
GICv3 distributor and redistributors by using the existing framework
to provide handler functions for each register group.

Currently we limit the emulation to a model enforcing a single
security state, with SRE==1 (forcing system register access) and
ARE==1 (allowing more than 8 VCPUs).

We share some of the functions provided for GICv2 emulation, but take
the different ways of addressing (v)CPUs into account.
Save and restore is currently not implemented.

Similar to the split-off of the GICv2 specific code, the new emulation
code goes into a new file (vgic-v3-emul.c).

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/kvm/Makefile            |   1 +
 include/kvm/arm_vgic.h             |   9 +-
 include/linux/irqchip/arm-gic-v3.h |  32 ++
 include/linux/kvm_host.h           |   1 +
 include/uapi/linux/kvm.h           |   2 +
 virt/kvm/arm/vgic-v3-emul.c        | 922 +++++++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic.c                |  11 +-
 virt/kvm/arm/vgic.h                |   3 +
 8 files changed, 978 insertions(+), 3 deletions(-)
 create mode 100644 virt/kvm/arm/vgic-v3-emul.c

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d9573533a873..4e6e09ee4033 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -24,5 +24,6 @@ kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
 kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ff04afd0d901..98c30168bce4 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -162,7 +162,11 @@ struct vgic_dist {
 
 	/* Distributor and vcpu interface mapping in the guest */
 	phys_addr_t		vgic_dist_base;
-	phys_addr_t		vgic_cpu_base;
+	/* GICv2 and GICv3 use different mapped register blocks */
+	union {
+		phys_addr_t		vgic_cpu_base;
+		phys_addr_t		vgic_redist_base;
+	};
 
 	/* Distributor enabled */
 	u32			enabled;
@@ -224,6 +228,9 @@ struct vgic_dist {
 	 */
 	struct vgic_bitmap	*irq_spi_target;
 
+	/* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
+	u32			*irq_spi_mpidr;
+
 	/* Bitmap indicating which CPU has something pending */
 	unsigned long		*irq_pending_on_cpu;
 
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 1e8b0cf30792..3fb4d8588a26 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -33,6 +33,7 @@
 #define GICD_SETSPI_SR			0x0050
 #define GICD_CLRSPI_SR			0x0058
 #define GICD_SEIR			0x0068
+#define GICD_IGROUPR			0x0080
 #define GICD_ISENABLER			0x0100
 #define GICD_ICENABLER			0x0180
 #define GICD_ISPENDR			0x0200
@@ -41,14 +42,37 @@
 #define GICD_ICACTIVER			0x0380
 #define GICD_IPRIORITYR			0x0400
 #define GICD_ICFGR			0x0C00
+#define GICD_IGRPMODR			0x0D00
+#define GICD_NSACR			0x0E00
 #define GICD_IROUTER			0x6000
+#define GICD_IDREGS			0xFFD0
 #define GICD_PIDR2			0xFFE8
 
+/*
+ * Those registers are actually from GICv2, but the spec demands that they
+ * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3).
+ */
+#define GICD_ITARGETSR			0x0800
+#define GICD_SGIR			0x0F00
+#define GICD_CPENDSGIR			0x0F10
+#define GICD_SPENDSGIR			0x0F20
+
 #define GICD_CTLR_RWP			(1U << 31)
+#define GICD_CTLR_DS			(1U << 6)
 #define GICD_CTLR_ARE_NS		(1U << 4)
 #define GICD_CTLR_ENABLE_G1A		(1U << 1)
 #define GICD_CTLR_ENABLE_G1		(1U << 0)
 
+/*
+ * In systems with a single security state (what we emulate in KVM)
+ * the meaning of the interrupt group enable bits is slightly different
+ */
+#define GICD_CTLR_ENABLE_SS_G1		(1U << 1)
+#define GICD_CTLR_ENABLE_SS_G0		(1U << 0)
+
+#define GICD_TYPER_LPIS			(1U << 17)
+#define GICD_TYPER_MBIS			(1U << 16)
+
 #define GICD_TYPER_ID_BITS(typer)	((((typer) >> 19) & 0x1f) + 1)
 #define GICD_TYPER_IRQS(typer)		((((typer) & 0x1f) + 1) * 32)
 #define GICD_TYPER_LPIS			(1U << 17)
@@ -60,6 +84,8 @@
 #define GIC_PIDR2_ARCH_GICv3		0x30
 #define GIC_PIDR2_ARCH_GICv4		0x40
 
+#define GIC_V3_DIST_SIZE		0x10000
+
 /*
  * Re-Distributor registers, offsets from RD_base
  */
@@ -78,6 +104,7 @@
 #define GICR_SYNCR			0x00C0
 #define GICR_MOVLPIR			0x0100
 #define GICR_MOVALLR			0x0110
+#define GICR_IDREGS			GICD_IDREGS
 #define GICR_PIDR2			GICD_PIDR2
 
 #define GICR_CTLR_ENABLE_LPIS		(1UL << 0)
@@ -104,6 +131,7 @@
 /*
  * Re-Distributor registers, offsets from SGI_base
  */
+#define GICR_IGROUPR0			GICD_IGROUPR
 #define GICR_ISENABLER0			GICD_ISENABLER
 #define GICR_ICENABLER0			GICD_ICENABLER
 #define GICR_ISPENDR0			GICD_ISPENDR
@@ -112,11 +140,15 @@
 #define GICR_ICACTIVER0			GICD_ICACTIVER
 #define GICR_IPRIORITYR0		GICD_IPRIORITYR
 #define GICR_ICFGR0			GICD_ICFGR
+#define GICR_IGRPMODR0			GICD_IGRPMODR
+#define GICR_NSACR			GICD_NSACR
 
 #define GICR_TYPER_PLPIS		(1U << 0)
 #define GICR_TYPER_VLPIS		(1U << 1)
 #define GICR_TYPER_LAST			(1U << 4)
 
+#define GIC_V3_REDIST_SIZE		0x20000
+
 #define LPI_PROP_GROUP1			(1 << 1)
 #define LPI_PROP_ENABLED		(1 << 0)
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 25d7ce31a5d4..0ef2daa199d8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1052,6 +1052,7 @@ void kvm_unregister_device_ops(u32 type);
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
+extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a37fd1224f36..b4e6f1e70f03 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -952,6 +952,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_ARM_VGIC_V2	KVM_DEV_TYPE_ARM_VGIC_V2
 	KVM_DEV_TYPE_FLIC,
 #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
+	KVM_DEV_TYPE_ARM_VGIC_V3,
+#define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
 	KVM_DEV_TYPE_MAX,
 };
 
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
new file mode 100644
index 000000000000..8db1db597223
--- /dev/null
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -0,0 +1,922 @@
+/*
+ * GICv3 distributor and redistributor emulation
+ *
+ * GICv3 emulation is currently only supported on a GICv3 host (because
+ * we rely on the hardware's CPU interface virtualization support), but
+ * supports both hardware with or without the optional GICv2 backwards
+ * compatibility features.
+ *
+ * Limitations of the emulation:
+ * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
+ * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
+ * - We do not support the message based interrupts (MBIs) triggered by
+ *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
+ * - We do not support the (optional) backwards compatibility feature.
+ *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
+ *   the compatiblity feature, you can use a GICv2 in the guest, though.
+ * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
+ * - Priorities are not emulated (same as the GICv2 emulation). Linux
+ *   as a guest is fine with this, because it does not use priorities.
+ * - We only support Group1 interrupts. Again Linux uses only those.
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
+			       struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg = 0xffffffff;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
+			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg = 0;
+
+	/*
+	 * Force ARE and DS to 1, the guest cannot change this.
+	 * For the time being we only support Group1 interrupts.
+	 */
+	if (vcpu->kvm->arch.vgic.enabled)
+		reg = GICD_CTLR_ENABLE_SS_G1;
+	reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+	if (mmio->is_write) {
+		if (reg & GICD_CTLR_ENABLE_SS_G0)
+			kvm_info("guest tried to enable unsupported Group0 interrupts\n");
+		vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
+		vgic_update_state(vcpu->kvm);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * As this implementation does not provide compatibility
+ * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
+ * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
+ * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
+ */
+#define INTERRUPT_ID_BITS 10
+static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
+			      struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg;
+
+	reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
+
+	reg |= (INTERRUPT_ID_BITS - 1) << 19;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
+			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg;
+
+	reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
+					    struct kvm_exit_mmio *mmio,
+					    phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+					      vcpu->vcpu_id,
+					      ACCESS_WRITE_SETBIT);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
+					      struct kvm_exit_mmio *mmio,
+					      phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+					      vcpu->vcpu_id,
+					      ACCESS_WRITE_CLEARBIT);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
+					     struct kvm_exit_mmio *mmio,
+					     phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
+						   vcpu->vcpu_id);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
+					       struct kvm_exit_mmio *mmio,
+					       phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
+						     vcpu->vcpu_id);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
+					  struct kvm_exit_mmio *mmio,
+					  phys_addr_t offset)
+{
+	u32 *reg;
+
+	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
+		vgic_reg_access(mmio, NULL, offset,
+				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
+				   vcpu->vcpu_id, offset);
+	vgic_reg_access(mmio, reg, offset,
+		ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+	return false;
+}
+
+static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
+				     struct kvm_exit_mmio *mmio,
+				     phys_addr_t offset)
+{
+	u32 *reg;
+
+	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
+		vgic_reg_access(mmio, NULL, offset,
+				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				  vcpu->vcpu_id, offset >> 1);
+
+	return vgic_handle_cfg_reg(reg, mmio, offset);
+}
+
+/*
+ * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
+ * when we store the target MPIDR written by the guest.
+ */
+static u32 compress_mpidr(unsigned long mpidr)
+{
+	u32 ret;
+
+	ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
+	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
+	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
+
+	return ret;
+}
+
+static unsigned long uncompress_mpidr(u32 value)
+{
+	unsigned long mpidr;
+
+	mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
+	mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
+	mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
+	mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
+
+	return mpidr;
+}
+
+/*
+ * Lookup the given MPIDR value to get the vcpu_id (if there is one)
+ * and store that in the irq_spi_cpu[] array.
+ * This limits the number of VCPUs to 255 for now, extending the data
+ * type (or storing kvm_vcpu pointers) should lift the limit.
+ * Store the original MPIDR value in an extra array to support read-as-written.
+ * Unallocated MPIDRs are translated to a special value and caught
+ * before any array accesses.
+ */
+static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
+				  struct kvm_exit_mmio *mmio,
+				  phys_addr_t offset)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	int spi;
+	u32 reg;
+	int vcpu_id;
+	unsigned long *bmap, mpidr;
+
+	/*
+	 * The upper 32 bits of each 64 bit register are zero,
+	 * as we don't support Aff3.
+	 */
+	if ((offset & 4)) {
+		vgic_reg_access(mmio, NULL, offset,
+				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	/* This region only covers SPIs, so no handling of private IRQs here. */
+	spi = offset / 8;
+
+	/* get the stored MPIDR for this IRQ */
+	mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
+	reg = mpidr;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+
+	if (!mmio->is_write)
+		return false;
+
+	/*
+	 * Now clear the currently assigned vCPU from the map, making room
+	 * for the new one to be written below
+	 */
+	vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
+	if (likely(vcpu)) {
+		vcpu_id = vcpu->vcpu_id;
+		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
+		__clear_bit(spi, bmap);
+	}
+
+	dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
+	vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
+
+	/*
+	 * The spec says that non-existent MPIDR values should not be
+	 * forwarded to any existent (v)CPU, but should be able to become
+	 * pending anyway. We simply keep the irq_spi_target[] array empty, so
+	 * the interrupt will never be injected.
+	 * irq_spi_cpu[irq] gets a magic value in this case.
+	 */
+	if (likely(vcpu)) {
+		vcpu_id = vcpu->vcpu_id;
+		dist->irq_spi_cpu[spi] = vcpu_id;
+		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
+		__set_bit(spi, bmap);
+	} else {
+		dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
+	}
+
+	vgic_update_state(kvm);
+
+	return true;
+}
+
+/*
+ * We should be careful about promising too much when a guest reads
+ * this register. Don't claim to be like any hardware implementation,
+ * but just report the GIC as version 3 - which is what a Linux guest
+ * would check.
+ */
+static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
+			       struct kvm_exit_mmio *mmio,
+			       phys_addr_t offset)
+{
+	u32 reg = 0;
+
+	switch (offset + GICD_IDREGS) {
+	case GICD_PIDR2:
+		reg = 0x3b;
+		break;
+	}
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
+	{
+		.base           = GICD_CTLR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_ctlr,
+	},
+	{
+		.base           = GICD_TYPER,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_typer,
+	},
+	{
+		.base           = GICD_IIDR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_iidr,
+	},
+	{
+		/* this register is optional, it is RAZ/WI if not implemented */
+		.base           = GICD_STATUSR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_raz_wi,
+	},
+	{
+		/* this write only register is WI when TYPER.MBIS=0 */
+		.base		= GICD_SETSPI_NSR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this write only register is WI when TYPER.MBIS=0 */
+		.base		= GICD_CLRSPI_NSR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_SETSPI_SR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_CLRSPI_SR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_IGROUPR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_rao_wi,
+	},
+	{
+		.base		= GICD_ISENABLER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_enable_reg_dist,
+	},
+	{
+		.base		= GICD_ICENABLER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_enable_reg_dist,
+	},
+	{
+		.base		= GICD_ISPENDR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_pending_reg_dist,
+	},
+	{
+		.base		= GICD_ICPENDR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_pending_reg_dist,
+	},
+	{
+		.base		= GICD_ISACTIVER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_ICACTIVER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_IPRIORITYR,
+		.len		= 0x400,
+		.bits_per_irq	= 8,
+		.handle_mmio	= handle_mmio_priority_reg_dist,
+	},
+	{
+		/* TARGETSRn is RES0 when ARE=1 */
+		.base		= GICD_ITARGETSR,
+		.len		= 0x400,
+		.bits_per_irq	= 8,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_ICFGR,
+		.len		= 0x100,
+		.bits_per_irq	= 2,
+		.handle_mmio	= handle_mmio_cfg_reg_dist,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_IGRPMODR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_NSACR,
+		.len		= 0x100,
+		.bits_per_irq	= 2,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when ARE=1 */
+		.base		= GICD_SGIR,
+		.len		= 0x04,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when ARE=1 */
+		.base		= GICD_CPENDSGIR,
+		.len		= 0x10,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when ARE=1 */
+		.base           = GICD_SPENDSGIR,
+		.len            = 0x10,
+		.handle_mmio    = handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_IROUTER + 0x100,
+		.len		= 0x1ee0,
+		.bits_per_irq	= 64,
+		.handle_mmio	= handle_mmio_route_reg,
+	},
+	{
+		.base           = GICD_IDREGS,
+		.len            = 0x30,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_idregs,
+	},
+	{},
+};
+
+static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
+					      struct kvm_exit_mmio *mmio,
+					      phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+				      redist_vcpu->vcpu_id,
+				      ACCESS_WRITE_SETBIT);
+}
+
+static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
+						struct kvm_exit_mmio *mmio,
+						phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+				      redist_vcpu->vcpu_id,
+				      ACCESS_WRITE_CLEARBIT);
+}
+
+static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
+					       struct kvm_exit_mmio *mmio,
+					       phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
+					   redist_vcpu->vcpu_id);
+}
+
+static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
+						 struct kvm_exit_mmio *mmio,
+						 phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
+					     redist_vcpu->vcpu_id);
+}
+
+static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
+					    struct kvm_exit_mmio *mmio,
+					    phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+	u32 *reg;
+
+	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
+				   redist_vcpu->vcpu_id, offset);
+	vgic_reg_access(mmio, reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+	return false;
+}
+
+static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
+				       struct kvm_exit_mmio *mmio,
+				       phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				       redist_vcpu->vcpu_id, offset >> 1);
+
+	return vgic_handle_cfg_reg(reg, mmio, offset);
+}
+
+static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
+	{
+		.base		= GICR_IGROUPR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_rao_wi,
+	},
+	{
+		.base		= GICR_ISENABLER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_enable_reg_redist,
+	},
+	{
+		.base		= GICR_ICENABLER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_enable_reg_redist,
+	},
+	{
+		.base		= GICR_ISPENDR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_pending_reg_redist,
+	},
+	{
+		.base		= GICR_ICPENDR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_pending_reg_redist,
+	},
+	{
+		.base		= GICR_ISACTIVER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_ICACTIVER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_IPRIORITYR0,
+		.len		= 0x20,
+		.bits_per_irq	= 8,
+		.handle_mmio	= handle_mmio_priority_reg_redist,
+	},
+	{
+		.base		= GICR_ICFGR0,
+		.len		= 0x08,
+		.bits_per_irq	= 2,
+		.handle_mmio	= handle_mmio_cfg_reg_redist,
+	},
+	{
+		.base		= GICR_IGRPMODR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_NSACR,
+		.len		= 0x04,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{},
+};
+
+static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
+				    struct kvm_exit_mmio *mmio,
+				    phys_addr_t offset)
+{
+	/* since we don't support LPIs, this register is zero for now */
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
+				     struct kvm_exit_mmio *mmio,
+				     phys_addr_t offset)
+{
+	u32 reg;
+	u64 mpidr;
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+	int target_vcpu_id = redist_vcpu->vcpu_id;
+
+	/* the upper 32 bits contain the affinity value */
+	if ((offset & ~3) == 4) {
+		mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
+		reg = compress_mpidr(mpidr);
+
+		vgic_reg_access(mmio, &reg, offset,
+				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	reg = redist_vcpu->vcpu_id << 8;
+	if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
+		reg |= GICR_TYPER_LAST;
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static const struct kvm_mmio_range vgic_redist_ranges[] = {
+	{
+		.base           = GICR_CTLR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_ctlr_redist,
+	},
+	{
+		.base           = GICR_TYPER,
+		.len            = 0x08,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_typer_redist,
+	},
+	{
+		.base           = GICR_IIDR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_iidr,
+	},
+	{
+		.base           = GICR_WAKER,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_raz_wi,
+	},
+	{
+		.base           = GICR_IDREGS,
+		.len            = 0x30,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_idregs,
+	},
+	{},
+};
+
+/*
+ * This function splits accesses between the distributor and the two
+ * redistributor parts (private/SPI). As each redistributor is accessible
+ * from any CPU, we have to determine the affected VCPU by taking the faulting
+ * address into account. We then pass this VCPU to the handler function via
+ * the private parameter.
+ */
+#define SGI_BASE_OFFSET SZ_64K
+static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
+				struct kvm_exit_mmio *mmio)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	unsigned long dbase = dist->vgic_dist_base;
+	unsigned long rdbase = dist->vgic_redist_base;
+	int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
+	int vcpu_id;
+	const struct kvm_mmio_range *mmio_range;
+
+	if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
+		return vgic_handle_mmio_range(vcpu, run, mmio,
+					      vgic_v3_dist_ranges, dbase);
+	}
+
+	if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
+	    GIC_V3_REDIST_SIZE * nrcpus))
+		return false;
+
+	vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
+	rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
+	mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+
+	if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
+		rdbase += SGI_BASE_OFFSET;
+		mmio_range = vgic_redist_sgi_ranges;
+	} else {
+		mmio_range = vgic_redist_ranges;
+	}
+	return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
+}
+
+static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
+{
+	if (vgic_queue_irq(vcpu, 0, irq)) {
+		vgic_dist_irq_clear_pending(vcpu, irq);
+		vgic_cpu_irq_clear(vcpu, irq);
+		return true;
+	}
+
+	return false;
+}
+
+static int vgic_v3_map_resources(struct kvm *kvm,
+				 const struct vgic_params *params)
+{
+	int ret = 0;
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	if (!irqchip_in_kernel(kvm))
+		return 0;
+
+	mutex_lock(&kvm->lock);
+
+	if (vgic_ready(kvm))
+		goto out;
+
+	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+	    IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
+		kvm_err("Need to set vgic distributor addresses first\n");
+		ret = -ENXIO;
+		goto out;
+	}
+
+	/*
+	 * For a VGICv3 we require the userland to explicitly initialize
+	 * the VGIC before we need to use it.
+	 */
+	if (!vgic_initialized(kvm)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	kvm->arch.vgic.ready = true;
+out:
+	if (ret)
+		kvm_vgic_destroy(kvm);
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static int vgic_v3_init_model(struct kvm *kvm)
+{
+	int i;
+	u32 mpidr;
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
+
+	dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
+				      GFP_KERNEL);
+
+	if (!dist->irq_spi_mpidr)
+		return -ENOMEM;
+
+	/* Initialize the target VCPUs for each IRQ to VCPU 0 */
+	mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
+	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
+		dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
+		dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
+		vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
+	}
+
+	return 0;
+}
+
+/* GICv3 does not keep track of SGI sources anymore. */
+static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
+{
+}
+
+void vgic_v3_init_emulation(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
+	dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
+	dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
+	dist->vm_ops.init_model = vgic_v3_init_model;
+	dist->vm_ops.map_resources = vgic_v3_map_resources;
+
+	kvm->arch.max_vcpus = KVM_MAX_VCPUS;
+}
+
+static int vgic_v3_create(struct kvm_device *dev, u32 type)
+{
+	return kvm_vgic_create(dev->kvm, type);
+}
+
+static void vgic_v3_destroy(struct kvm_device *dev)
+{
+	kfree(dev);
+}
+
+static int vgic_v3_set_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_set_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return -ENXIO;
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_v3_get_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_get_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return -ENXIO;
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_v3_has_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_V2_ADDR_TYPE_DIST:
+		case KVM_VGIC_V2_ADDR_TYPE_CPU:
+			return -ENXIO;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return -ENXIO;
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return 0;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return 0;
+		}
+	}
+	return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+	.name = "kvm-arm-vgic-v3",
+	.create = vgic_v3_create,
+	.destroy = vgic_v3_destroy,
+	.set_attr = vgic_v3_set_attr,
+	.get_attr = vgic_v3_get_attr,
+	.has_attr = vgic_v3_has_attr,
+};
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index b6e17c886ce2..6d23e57c3561 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1249,7 +1249,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 	struct kvm_vcpu *vcpu;
 	int edge_triggered, level_triggered;
 	int enabled;
-	bool ret = true;
+	bool ret = true, can_inject = true;
 
 	spin_lock(&dist->lock);
 
@@ -1264,6 +1264,11 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 
 	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
 		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
+		if (cpuid == VCPU_NOT_ALLOCATED) {
+			/* Pretend we use CPU0, and prevent injection */
+			cpuid = 0;
+			can_inject = false;
+		}
 		vcpu = kvm_get_vcpu(kvm, cpuid);
 	}
 
@@ -1286,7 +1291,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 
 	enabled = vgic_irq_is_enabled(vcpu, irq_num);
 
-	if (!enabled) {
+	if (!enabled || !can_inject) {
 		ret = false;
 		goto out;
 	}
@@ -1439,6 +1444,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
 	}
 	kfree(dist->irq_sgi_sources);
 	kfree(dist->irq_spi_cpu);
+	kfree(dist->irq_spi_mpidr);
 	kfree(dist->irq_spi_target);
 	kfree(dist->irq_pending_on_cpu);
 	dist->irq_sgi_sources = NULL;
@@ -1594,6 +1600,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
 	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
 	kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+	kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
 
 out_unlock:
 	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
index e363b9341873..1e83bdf5f499 100644
--- a/virt/kvm/arm/vgic.h
+++ b/virt/kvm/arm/vgic.h
@@ -35,6 +35,8 @@
 #define ACCESS_WRITE_VALUE	(3 << 1)
 #define ACCESS_WRITE_MASK(x)	((x) & (3 << 1))
 
+#define VCPU_NOT_ALLOCATED	((u8)-1)
+
 unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
 
 void vgic_update_state(struct kvm *kvm);
@@ -116,5 +118,6 @@ int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
 
 int vgic_init(struct kvm *kvm);
 void vgic_v2_init_emulation(struct kvm *kvm);
+void vgic_v3_init_emulation(struct kvm *kvm);
 
 #endif
-- 
cgit v1.2.3


From 7e5802781c3e109558ddfd8b02155ad24d872ee7 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Wed, 12 Nov 2014 13:46:06 +0000
Subject: arm64: GICv3: introduce symbolic names for GICv3 ICC_SGI1R_EL1 fields

The gic_send_sgi() function used hardcoded bit shift values to
generate the ICC_SGI1R_EL1 register value.
Replace this with symbolic names to allow reusing them later.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 drivers/irqchip/irq-gic-v3.c       | 14 +++++++++-----
 include/linux/irqchip/arm-gic-v3.h | 12 ++++++++++++
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 1a146ccee701..2ab290bec655 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -481,15 +481,19 @@ out:
 	return tlist;
 }
 
+#define MPIDR_TO_SGI_AFFINITY(cluster_id, level) \
+	(MPIDR_AFFINITY_LEVEL(cluster_id, level) \
+		<< ICC_SGI1R_AFFINITY_## level ##_SHIFT)
+
 static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq)
 {
 	u64 val;
 
-	val = (MPIDR_AFFINITY_LEVEL(cluster_id, 3) << 48	|
-	       MPIDR_AFFINITY_LEVEL(cluster_id, 2) << 32	|
-	       irq << 24			    		|
-	       MPIDR_AFFINITY_LEVEL(cluster_id, 1) << 16	|
-	       tlist);
+	val = (MPIDR_TO_SGI_AFFINITY(cluster_id, 3)	|
+	       MPIDR_TO_SGI_AFFINITY(cluster_id, 2)	|
+	       irq << ICC_SGI1R_SGI_ID_SHIFT		|
+	       MPIDR_TO_SGI_AFFINITY(cluster_id, 1)	|
+	       tlist << ICC_SGI1R_TARGET_LIST_SHIFT);
 
 	pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
 	gic_write_sgi1r(val);
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 3fb4d8588a26..800544bc7bfd 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -280,6 +280,18 @@
 #define ICC_SRE_EL2_SRE			(1 << 0)
 #define ICC_SRE_EL2_ENABLE		(1 << 3)
 
+#define ICC_SGI1R_TARGET_LIST_SHIFT	0
+#define ICC_SGI1R_TARGET_LIST_MASK	(0xffff << ICC_SGI1R_TARGET_LIST_SHIFT)
+#define ICC_SGI1R_AFFINITY_1_SHIFT	16
+#define ICC_SGI1R_AFFINITY_1_MASK	(0xff << ICC_SGI1R_AFFINITY_1_SHIFT)
+#define ICC_SGI1R_SGI_ID_SHIFT		24
+#define ICC_SGI1R_SGI_ID_MASK		(0xff << ICC_SGI1R_SGI_ID_SHIFT)
+#define ICC_SGI1R_AFFINITY_2_SHIFT	32
+#define ICC_SGI1R_AFFINITY_2_MASK	(0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
+#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT	40
+#define ICC_SGI1R_AFFINITY_3_SHIFT	48
+#define ICC_SGI1R_AFFINITY_3_MASK	(0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
+
 /*
  * System register definitions
  */
-- 
cgit v1.2.3


From 97b713ba3ebaa6c8d84c2c720f5468a7c6a6eb4e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Jan 2015 10:42:31 +0100
Subject: fs: kill BDI_CAP_SWAP_BACKED

This bdi flag isn't too useful - we can determine that a vma is backed by
either swap or shmem trivially in the caller.

This also allows removing the backing_dev_info instaces for swap and shmem
in favor of noop_backing_dev_info.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev.h | 13 -------------
 mm/madvise.c                | 17 ++++++++++-------
 mm/shmem.c                  | 25 +++++++------------------
 mm/swap.c                   |  2 --
 mm/swap_state.c             |  7 +------
 5 files changed, 18 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012b7a14..e936cea856dc 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -238,8 +238,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_WRITE_MAP:      Can be mapped for writing
  * BDI_CAP_EXEC_MAP:       Can be mapped for execution
  *
- * BDI_CAP_SWAP_BACKED:    Count shmem/tmpfs objects as swap-backed.
- *
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
@@ -250,7 +248,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_WRITE_MAP	0x00000020
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
-#define BDI_CAP_SWAP_BACKED	0x00000100
 #define BDI_CAP_STABLE_WRITES	0x00000200
 #define BDI_CAP_STRICTLIMIT	0x00000400
 
@@ -329,11 +326,6 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
 				      BDI_CAP_NO_WRITEBACK));
 }
 
-static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
-{
-	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
-}
-
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -344,11 +336,6 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
-static inline bool mapping_cap_swap_backed(struct address_space *mapping)
-{
-	return bdi_cap_swap_backed(mapping->backing_dev_info);
-}
-
 static inline int bdi_sched_wait(void *word)
 {
 	schedule();
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc93289..1383a8916bc3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	struct file *file = vma->vm_file;
 
 #ifdef CONFIG_SWAP
-	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+	if (!file) {
 		*prev = vma;
-		if (!file)
-			force_swapin_readahead(vma, start, end);
-		else
-			force_shm_swapin_readahead(vma, start, end,
-						file->f_mapping);
+		force_swapin_readahead(vma, start, end);
 		return 0;
 	}
-#endif
 
+	if (shmem_mapping(file->f_mapping)) {
+		*prev = vma;
+		force_shm_swapin_readahead(vma, start, end,
+					file->f_mapping);
+		return 0;
+	}
+#else
 	if (!file)
 		return -EBADF;
+#endif
 
 	if (file->f_mapping->a_ops->get_xip_mem) {
 		/* no bad return value, but ignore advice */
diff --git a/mm/shmem.c b/mm/shmem.c
index 73ba1df7c8ba..1b77eaf589fd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations;
 static const struct inode_operations shmem_special_inode_operations;
 static const struct vm_operations_struct shmem_vm_ops;
 
-static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
@@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		goto redirty;
 
 	/*
-	 * shmem_backing_dev_info's capabilities prevent regular writeback or
-	 * sync from ever calling shmem_writepage; but a stacking filesystem
-	 * might use ->writepage of its underlying filesystem, in which case
-	 * tmpfs should write out to swap only in response to memory pressure,
-	 * and not for the writeback threads or sync.
+	 * Our capabilities prevent regular writeback or sync from ever calling
+	 * shmem_writepage; but a stacking filesystem might use ->writepage of
+	 * its underlying filesystem, in which case tmpfs should write out to
+	 * swap only in response to memory pressure, and not for the writeback
+	 * threads or sync.
 	 */
 	if (!wbc->for_reclaim) {
 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
@@ -1415,7 +1410,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
-		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
+		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_generation = get_seconds();
 		info = SHMEM_I(inode);
@@ -1461,7 +1456,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 bool shmem_mapping(struct address_space *mapping)
 {
-	return mapping->backing_dev_info == &shmem_backing_dev_info;
+	return mapping->host->i_sb->s_op == &shmem_ops;
 }
 
 #ifdef CONFIG_TMPFS
@@ -3226,10 +3221,6 @@ int __init shmem_init(void)
 	if (shmem_inode_cachep)
 		return 0;
 
-	error = bdi_init(&shmem_backing_dev_info);
-	if (error)
-		goto out4;
-
 	error = shmem_init_inodecache();
 	if (error)
 		goto out3;
@@ -3253,8 +3244,6 @@ out1:
 out2:
 	shmem_destroy_inodecache();
 out3:
-	bdi_destroy(&shmem_backing_dev_info);
-out4:
 	shm_mnt = ERR_PTR(error);
 	return error;
 }
diff --git a/mm/swap.c b/mm/swap.c
index 8a12b33936b4..4e0109a2f37b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1138,8 +1138,6 @@ void __init swap_setup(void)
 #ifdef CONFIG_SWAP
 	int i;
 
-	if (bdi_init(swapper_spaces[0].backing_dev_info))
-		panic("Failed to init swap bdi");
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		spin_lock_init(&swapper_spaces[i].tree_lock);
 		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9711342987a0..1c137b69ecde 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -32,17 +32,12 @@ static const struct address_space_operations swap_aops = {
 #endif
 };
 
-static struct backing_dev_info swap_backing_dev_info = {
-	.name		= "swap",
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
 struct address_space swapper_spaces[MAX_SWAPFILES] = {
 	[0 ... MAX_SWAPFILES - 1] = {
 		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
 		.i_mmap_writable = ATOMIC_INIT(0),
 		.a_ops		= &swap_aops,
-		.backing_dev_info = &swap_backing_dev_info,
+		.backing_dev_info = &noop_backing_dev_info,
 	}
 };
 
-- 
cgit v1.2.3


From b4caecd48005fbed3949dde6c1cb233142fd69e9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Jan 2015 10:42:32 +0100
Subject: fs: introduce f_op->mmap_capabilities for nommu mmap support

Since "BDI: Provide backing device capability information [try #3]" the
backing_dev_info structure also provides flags for the kind of mmap
operation available in a nommu environment, which is entirely unrelated
to it's original purpose.

Introduce a new nommu-only file operation to provide this information to
the nommu mmap code instead.  Splitting this from the backing_dev_info
structure allows to remove lots of backing_dev_info instance that aren't
otherwise needed, and entirely gets rid of the concept of providing a
backing_dev_info for a character device.  It also removes the need for
the mtd_inodefs filesystem.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tejun Heo <tj@kernel.org>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/nommu-mmap.txt                    |  8 +--
 block/blk-core.c                                |  2 +-
 drivers/char/mem.c                              | 64 ++++++++++----------
 drivers/mtd/mtdchar.c                           | 72 ++++------------------
 drivers/mtd/mtdconcat.c                         | 10 ----
 drivers/mtd/mtdcore.c                           | 80 +++++++------------------
 drivers/mtd/mtdpart.c                           |  1 -
 drivers/staging/lustre/lustre/llite/llite_lib.c |  2 +-
 fs/9p/v9fs.c                                    |  2 +-
 fs/afs/volume.c                                 |  2 +-
 fs/aio.c                                        | 14 +----
 fs/btrfs/disk-io.c                              |  3 +-
 fs/char_dev.c                                   | 24 --------
 fs/cifs/connect.c                               |  2 +-
 fs/coda/inode.c                                 |  2 +-
 fs/configfs/configfs_internal.h                 |  2 -
 fs/configfs/inode.c                             | 18 +-----
 fs/configfs/mount.c                             | 11 +---
 fs/ecryptfs/main.c                              |  2 +-
 fs/exofs/super.c                                |  2 +-
 fs/ncpfs/inode.c                                |  2 +-
 fs/ramfs/file-nommu.c                           |  7 +++
 fs/ramfs/inode.c                                | 22 +------
 fs/romfs/mmap-nommu.c                           | 10 ++++
 fs/ubifs/super.c                                |  2 +-
 include/linux/backing-dev.h                     | 33 ++--------
 include/linux/cdev.h                            |  2 -
 include/linux/fs.h                              | 23 +++++++
 include/linux/mtd/mtd.h                         |  2 +
 mm/backing-dev.c                                |  7 +--
 mm/nommu.c                                      | 69 ++++++++++-----------
 security/security.c                             | 13 ++--
 32 files changed, 169 insertions(+), 346 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index 8e1ddec2c78a..ae57b9ea0d41 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -43,12 +43,12 @@ and it's also much more restricted in the latter case:
            even if this was created by another process.
 
          - If possible, the file mapping will be directly on the backing device
-           if the backing device has the BDI_CAP_MAP_DIRECT capability and
+           if the backing device has the NOMMU_MAP_DIRECT capability and
            appropriate mapping protection capabilities. Ramfs, romfs, cramfs
            and mtd might all permit this.
 
 	 - If the backing device device can't or won't permit direct sharing,
-           but does have the BDI_CAP_MAP_COPY capability, then a copy of the
+           but does have the NOMMU_MAP_COPY capability, then a copy of the
            appropriate bit of the file will be read into a contiguous bit of
            memory and any extraneous space beyond the EOF will be cleared
 
@@ -220,7 +220,7 @@ directly (can't be copied).
 
 The file->f_op->mmap() operation will be called to actually inaugurate the
 mapping. It can be rejected at that point. Returning the ENOSYS error will
-cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified.
+cause the mapping to be copied instead if NOMMU_MAP_COPY is specified.
 
 The vm_ops->close() routine will be invoked when the last mapping on a chardev
 is removed. An existing mapping will be shared, partially or not, if possible
@@ -232,7 +232,7 @@ want to handle it, despite the fact it's got an operation. For instance, it
 might try directing the call to a secondary driver which turns out not to
 implement it. Such is the case for the framebuffer driver which attempts to
 direct the call to the device-specific driver. Under such circumstances, the
-mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a
+mapping request will be rejected if NOMMU_MAP_COPY is not specified, and a
 copy mapped otherwise.
 
 IMPORTANT NOTE:
diff --git a/block/blk-core.c b/block/blk-core.c
index 3ad405571dcc..928aac29bccd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
-	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+	q->backing_dev_info.capabilities = 0;
 	q->backing_dev_info.name = "block";
 	q->node = node_id;
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 4c58333b4257..9a6b63783a94 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file,
 	return pgoff << PAGE_SHIFT;
 }
 
+/* permit direct mmap, for read, write or exec */
+static unsigned memory_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_DIRECT |
+		NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
+static unsigned zero_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_COPY;
+}
+
 /* can't do an in-place private mapping if there's no MMU */
 static inline int private_mapping_ok(struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_MAYSHARE;
 }
 #else
-#define get_unmapped_area_mem	NULL
 
 static inline int private_mapping_ok(struct vm_area_struct *vma)
 {
@@ -721,7 +732,10 @@ static const struct file_operations mem_fops = {
 	.write		= write_mem,
 	.mmap		= mmap_mem,
 	.open		= open_mem,
+#ifndef CONFIG_MMU
 	.get_unmapped_area = get_unmapped_area_mem,
+	.mmap_capabilities = memory_mmap_capabilities,
+#endif
 };
 
 #ifdef CONFIG_DEVKMEM
@@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = {
 	.write		= write_kmem,
 	.mmap		= mmap_kmem,
 	.open		= open_kmem,
+#ifndef CONFIG_MMU
 	.get_unmapped_area = get_unmapped_area_mem,
+	.mmap_capabilities = memory_mmap_capabilities,
+#endif
 };
 #endif
 
@@ -760,16 +777,9 @@ static const struct file_operations zero_fops = {
 	.read_iter	= read_iter_zero,
 	.aio_write	= aio_write_zero,
 	.mmap		= mmap_zero,
-};
-
-/*
- * capabilities for /dev/zero
- * - permits private mappings, "copies" are taken of the source of zeros
- * - no writeback happens
- */
-static struct backing_dev_info zero_bdi = {
-	.name		= "char/mem",
-	.capabilities	= BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK,
+#ifndef CONFIG_MMU
+	.mmap_capabilities = zero_mmap_capabilities,
+#endif
 };
 
 static const struct file_operations full_fops = {
@@ -783,22 +793,22 @@ static const struct memdev {
 	const char *name;
 	umode_t mode;
 	const struct file_operations *fops;
-	struct backing_dev_info *dev_info;
+	fmode_t fmode;
 } devlist[] = {
-	 [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi },
+	 [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET },
 #ifdef CONFIG_DEVKMEM
-	 [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi },
+	 [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET },
 #endif
-	 [3] = { "null", 0666, &null_fops, NULL },
+	 [3] = { "null", 0666, &null_fops, 0 },
 #ifdef CONFIG_DEVPORT
-	 [4] = { "port", 0, &port_fops, NULL },
+	 [4] = { "port", 0, &port_fops, 0 },
 #endif
-	 [5] = { "zero", 0666, &zero_fops, &zero_bdi },
-	 [7] = { "full", 0666, &full_fops, NULL },
-	 [8] = { "random", 0666, &random_fops, NULL },
-	 [9] = { "urandom", 0666, &urandom_fops, NULL },
+	 [5] = { "zero", 0666, &zero_fops, 0 },
+	 [7] = { "full", 0666, &full_fops, 0 },
+	 [8] = { "random", 0666, &random_fops, 0 },
+	 [9] = { "urandom", 0666, &urandom_fops, 0 },
 #ifdef CONFIG_PRINTK
-	[11] = { "kmsg", 0644, &kmsg_fops, NULL },
+	[11] = { "kmsg", 0644, &kmsg_fops, 0 },
 #endif
 };
 
@@ -816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp)
 		return -ENXIO;
 
 	filp->f_op = dev->fops;
-	if (dev->dev_info)
-		filp->f_mapping->backing_dev_info = dev->dev_info;
-
-	/* Is /dev/mem or /dev/kmem ? */
-	if (dev->dev_info == &directly_mappable_cdev_bdi)
-		filp->f_mode |= FMODE_UNSIGNED_OFFSET;
+	filp->f_mode |= dev->fmode;
 
 	if (dev->fops->open)
 		return dev->fops->open(inode, filp);
@@ -846,11 +851,6 @@ static struct class *mem_class;
 static int __init chr_dev_init(void)
 {
 	int minor;
-	int err;
-
-	err = bdi_init(&zero_bdi);
-	if (err)
-		return err;
 
 	if (register_chrdev(MEM_MAJOR, "mem", &memory_fops))
 		printk("unable to get major %d for memory devs\n", MEM_MAJOR);
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 53563955931b..55fa27ecf4e1 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex);
  */
 struct mtd_file_info {
 	struct mtd_info *mtd;
-	struct inode *ino;
 	enum mtd_file_modes mode;
 };
 
@@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig)
 	return fixed_size_llseek(file, offset, orig, mfi->mtd->size);
 }
 
-static int count;
-static struct vfsmount *mnt;
-static struct file_system_type mtd_inodefs_type;
-
 static int mtdchar_open(struct inode *inode, struct file *file)
 {
 	int minor = iminor(inode);
@@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file)
 	int ret = 0;
 	struct mtd_info *mtd;
 	struct mtd_file_info *mfi;
-	struct inode *mtd_ino;
 
 	pr_debug("MTD_open\n");
 
@@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file)
 	if ((file->f_mode & FMODE_WRITE) && (minor & 1))
 		return -EACCES;
 
-	ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count);
-	if (ret)
-		return ret;
-
 	mutex_lock(&mtd_mutex);
 	mtd = get_mtd_device(NULL, devnum);
 
@@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file)
 		goto out1;
 	}
 
-	mtd_ino = iget_locked(mnt->mnt_sb, devnum);
-	if (!mtd_ino) {
-		ret = -ENOMEM;
-		goto out1;
-	}
-	if (mtd_ino->i_state & I_NEW) {
-		mtd_ino->i_private = mtd;
-		mtd_ino->i_mode = S_IFCHR;
-		mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info;
-		unlock_new_inode(mtd_ino);
-	}
-	file->f_mapping = mtd_ino->i_mapping;
-
 	/* You can't open it RW if it's not a writeable device */
 	if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) {
 		ret = -EACCES;
-		goto out2;
+		goto out1;
 	}
 
 	mfi = kzalloc(sizeof(*mfi), GFP_KERNEL);
 	if (!mfi) {
 		ret = -ENOMEM;
-		goto out2;
+		goto out1;
 	}
-	mfi->ino = mtd_ino;
 	mfi->mtd = mtd;
 	file->private_data = mfi;
 	mutex_unlock(&mtd_mutex);
 	return 0;
 
-out2:
-	iput(mtd_ino);
 out1:
 	put_mtd_device(mtd);
 out:
 	mutex_unlock(&mtd_mutex);
-	simple_release_fs(&mnt, &count);
 	return ret;
 } /* mtdchar_open */
 
@@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file)
 	if ((file->f_mode & FMODE_WRITE))
 		mtd_sync(mtd);
 
-	iput(mfi->ino);
-
 	put_mtd_device(mtd);
 	file->private_data = NULL;
 	kfree(mfi);
-	simple_release_fs(&mnt, &count);
 
 	return 0;
 } /* mtdchar_close */
@@ -1117,6 +1087,13 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file,
 	ret = mtd_get_unmapped_area(mtd, len, offset, flags);
 	return ret == -EOPNOTSUPP ? -ENODEV : ret;
 }
+
+static unsigned mtdchar_mmap_capabilities(struct file *file)
+{
+	struct mtd_file_info *mfi = file->private_data;
+
+	return mtd_mmap_capabilities(mfi->mtd);
+}
 #endif
 
 /*
@@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = {
 	.mmap		= mtdchar_mmap,
 #ifndef CONFIG_MMU
 	.get_unmapped_area = mtdchar_get_unmapped_area,
+	.mmap_capabilities = mtdchar_mmap_capabilities,
 #endif
 };
 
-static const struct super_operations mtd_ops = {
-	.drop_inode = generic_delete_inode,
-	.statfs = simple_statfs,
-};
-
-static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
-{
-	return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC);
-}
-
-static struct file_system_type mtd_inodefs_type = {
-       .name = "mtd_inodefs",
-       .mount = mtd_inodefs_mount,
-       .kill_sb = kill_anon_super,
-};
-MODULE_ALIAS_FS("mtd_inodefs");
-
 int __init init_mtdchar(void)
 {
 	int ret;
@@ -1193,23 +1153,11 @@ int __init init_mtdchar(void)
 		return ret;
 	}
 
-	ret = register_filesystem(&mtd_inodefs_type);
-	if (ret) {
-		pr_err("Can't register mtd_inodefs filesystem, error %d\n",
-		       ret);
-		goto err_unregister_chdev;
-	}
-
-	return ret;
-
-err_unregister_chdev:
-	__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
 	return ret;
 }
 
 void __exit cleanup_mtdchar(void)
 {
-	unregister_filesystem(&mtd_inodefs_type);
 	__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
 }
 
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index b9000563b9f4..eacc3aac7327 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 
 	concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks;
 
-	concat->mtd.backing_dev_info = subdev[0]->backing_dev_info;
-
 	concat->subdev[0] = subdev[0];
 
 	for (i = 1; i < num_devs; i++) {
@@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 				    subdev[i]->flags & MTD_WRITEABLE;
 		}
 
-		/* only permit direct mapping if the BDIs are all the same
-		 * - copy-mapping is still permitted
-		 */
-		if (concat->mtd.backing_dev_info !=
-		    subdev[i]->backing_dev_info)
-			concat->mtd.backing_dev_info =
-				&default_backing_dev_info;
-
 		concat->mtd.size += subdev[i]->size;
 		concat->mtd.ecc_stats.badblocks +=
 			subdev[i]->ecc_stats.badblocks;
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 4c611871d7e6..ff38a1df22f7 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -43,33 +43,7 @@
 
 #include "mtdcore.h"
 
-/*
- * backing device capabilities for non-mappable devices (such as NAND flash)
- * - permits private mappings, copies are taken of the data
- */
-static struct backing_dev_info mtd_bdi_unmappable = {
-	.capabilities	= BDI_CAP_MAP_COPY,
-};
-
-/*
- * backing device capabilities for R/O mappable devices (such as ROM)
- * - permits private mappings, copies are taken of the data
- * - permits non-writable shared mappings
- */
-static struct backing_dev_info mtd_bdi_ro_mappable = {
-	.capabilities	= (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
-			   BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP),
-};
-
-/*
- * backing device capabilities for writable mappable devices (such as RAM)
- * - permits private mappings, copies are taken of the data
- * - permits non-writable shared mappings
- */
-static struct backing_dev_info mtd_bdi_rw_mappable = {
-	.capabilities	= (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
-			   BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP |
-			   BDI_CAP_WRITE_MAP),
+static struct backing_dev_info mtd_bdi = {
 };
 
 static int mtd_cls_suspend(struct device *dev, pm_message_t state);
@@ -365,6 +339,22 @@ static struct device_type mtd_devtype = {
 	.release	= mtd_release,
 };
 
+#ifndef CONFIG_MMU
+unsigned mtd_mmap_capabilities(struct mtd_info *mtd)
+{
+	switch (mtd->type) {
+	case MTD_RAM:
+		return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC |
+			NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+	case MTD_ROM:
+		return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC |
+			NOMMU_MAP_READ;
+	default:
+		return NOMMU_MAP_COPY;
+	}
+}
+#endif
+
 /**
  *	add_mtd_device - register an MTD device
  *	@mtd: pointer to new MTD device info structure
@@ -380,19 +370,7 @@ int add_mtd_device(struct mtd_info *mtd)
 	struct mtd_notifier *not;
 	int i, error;
 
-	if (!mtd->backing_dev_info) {
-		switch (mtd->type) {
-		case MTD_RAM:
-			mtd->backing_dev_info = &mtd_bdi_rw_mappable;
-			break;
-		case MTD_ROM:
-			mtd->backing_dev_info = &mtd_bdi_ro_mappable;
-			break;
-		default:
-			mtd->backing_dev_info = &mtd_bdi_unmappable;
-			break;
-		}
-	}
+	mtd->backing_dev_info = &mtd_bdi;
 
 	BUG_ON(mtd->writesize == 0);
 	mutex_lock(&mtd_table_mutex);
@@ -1237,17 +1215,9 @@ static int __init init_mtd(void)
 	if (ret)
 		goto err_reg;
 
-	ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap");
-	if (ret)
-		goto err_bdi1;
-
-	ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap");
-	if (ret)
-		goto err_bdi2;
-
-	ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap");
+	ret = mtd_bdi_init(&mtd_bdi, "mtd");
 	if (ret)
-		goto err_bdi3;
+		goto err_bdi;
 
 	proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops);
 
@@ -1260,11 +1230,7 @@ static int __init init_mtd(void)
 out_procfs:
 	if (proc_mtd)
 		remove_proc_entry("mtd", NULL);
-err_bdi3:
-	bdi_destroy(&mtd_bdi_ro_mappable);
-err_bdi2:
-	bdi_destroy(&mtd_bdi_unmappable);
-err_bdi1:
+err_bdi:
 	class_unregister(&mtd_class);
 err_reg:
 	pr_err("Error registering mtd class or bdi: %d\n", ret);
@@ -1277,9 +1243,7 @@ static void __exit cleanup_mtd(void)
 	if (proc_mtd)
 		remove_proc_entry("mtd", NULL);
 	class_unregister(&mtd_class);
-	bdi_destroy(&mtd_bdi_unmappable);
-	bdi_destroy(&mtd_bdi_ro_mappable);
-	bdi_destroy(&mtd_bdi_rw_mappable);
+	bdi_destroy(&mtd_bdi);
 }
 
 module_init(init_mtd);
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index a3e3a7d074d5..e779de315ade 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
 
 	slave->mtd.name = name;
 	slave->mtd.owner = master->owner;
-	slave->mtd.backing_dev_info = master->backing_dev_info;
 
 	/* NOTE:  we don't arrange MTDs as a tree; it'd be error-prone
 	 * to have the same data be in two different partitions.
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index a3367bfb1456..d5b149c5542d 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 	if (err)
 		goto out_free;
 	lsi->lsi_flags |= LSI_BDI_INITIALIZED;
-	lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+	lsi->lsi_bdi.capabilities = 0;
 	err = ll_bdi_register(&lsi->lsi_bdi);
 	if (err)
 		goto out_free;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 	init_rwsem(&v9ses->rename_sem);
 
-	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
 	if (rc) {
 		kfree(v9ses->aname);
 		kfree(v9ses->uname);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
-	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;
 
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893ecc296..6f13d3fab07f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
-	.name           = "aiofs",
-	.state          = 0,
-	.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-	inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
+	inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +221,6 @@ static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
-	if (bdi_init(&aio_fs_backing_dev_info))
-		panic("Failed to init aio fs backing dev info.");
-
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..afc4092989cd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1715,8 +1715,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
+	err = bdi_setup_and_register(bdi, "btrfs");
 	if (err)
 		return err;
 
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
 
 #include "internal.h"
 
-/*
- * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
- * devices
- * - permits shared-mmap for read, write and/or exec
- * - does not permit private mmap in NOMMU mode (can't do COW)
- * - no readahead or I/O queue unplugging required
- */
-struct backing_dev_info directly_mappable_cdev_bdi = {
-	.name = "char",
-	.capabilities	= (
-#ifdef CONFIG_MMU
-		/* permit private copies of the data to be taken */
-		BDI_CAP_MAP_COPY |
-#endif
-		/* permit direct mmap, for read, write or exec */
-		BDI_CAP_MAP_DIRECT |
-		BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
-		/* no writeback happens */
-		BDI_CAP_NO_ACCT_AND_WRITEBACK),
-};
-
 static struct kobj_map *cdev_map;
 
 static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
 void __init chrdev_init(void)
 {
 	cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
-	if (bdi_init(&directly_mappable_cdev_bdi))
-		panic("Failed to init directly mappable cdev bdi");
 }
 
 
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
 EXPORT_SYMBOL(__register_chrdev);
 EXPORT_SYMBOL(__unregister_chrdev);
-EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
 	int referral_walks_count = 0;
 #endif
 
-	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
 	if (rc)
 		return rc;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		goto unlock_out;
 	}
 
-	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+	error = bdi_setup_and_register(&vc->bdi, "coda");
 	if (error)
 		goto unlock_out;
 
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
 
 extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
 extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
-extern int configfs_inode_init(void);
-extern void configfs_inode_exit(void);
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..0ad6b4d6de00 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
 	.write_end	= simple_write_end,
 };
 
-static struct backing_dev_info configfs_backing_dev_info = {
-	.name		= "configfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
 static const struct inode_operations configfs_inode_operations ={
 	.setattr	= configfs_setattr,
 };
@@ -137,7 +131,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode->i_mapping->a_ops = &configfs_aops;
-		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
 
 		if (sd->s_iattr) {
@@ -283,13 +277,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 }
-
-int __init configfs_inode_init(void)
-{
-	return bdi_init(&configfs_backing_dev_info);
-}
-
-void configfs_inode_exit(void)
-{
-	bdi_destroy(&configfs_backing_dev_info);
-}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
 	if (!config_kobj)
 		goto out2;
 
-	err = configfs_inode_init();
-	if (err)
-		goto out3;
-
 	err = register_filesystem(&configfs_fs_type);
 	if (err)
-		goto out4;
+		goto out3;
 
 	return 0;
-out4:
-	pr_err("Unable to register filesystem!\n");
-	configfs_inode_exit();
 out3:
+	pr_err("Unable to register filesystem!\n");
 	kobject_put(config_kobj);
 out2:
 	kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
 	kobject_put(config_kobj);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
-	configfs_inode_exit();
 }
 
 MODULE_AUTHOR("Oracle");
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 		goto out;
 	}
 
-	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
 	if (rc)
 		goto out1;
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&sbi->bdi, "exofs");
 	if (ret) {
 		EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
 		dput(sb->s_root);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..a699a3fc62c0 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -560,7 +560,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	server = NCP_SBP(sb);
 	memset(server, 0, sizeof(*server));
 
-	error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+	error = bdi_setup_and_register(&server->bdi, "ncpfs");
 	if (error)
 		goto out_fput;
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 						   unsigned long flags);
 static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
+		NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
 const struct file_operations ramfs_file_operations = {
+	.mmap_capabilities	= ramfs_mmap_capabilities,
 	.mmap			= ramfs_nommu_mmap,
 	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
 	.read			= new_sync_read,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..ad4d712002f4 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
 	.set_page_dirty	= __set_page_dirty_no_writeback,
 };
 
-static struct backing_dev_info ramfs_backing_dev_info = {
-	.name		= "ramfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
-			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
-			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
-
 struct inode *ramfs_get_inode(struct super_block *sb,
 				const struct inode *dir, umode_t mode, dev_t dev)
 {
@@ -67,7 +59,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
-		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
+		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +259,9 @@ static struct file_system_type ramfs_fs_type = {
 int __init init_ramfs_fs(void)
 {
 	static unsigned long once;
-	int err;
 
 	if (test_and_set_bit(0, &once))
 		return 0;
-
-	err = bdi_init(&ramfs_backing_dev_info);
-	if (err)
-		return err;
-
-	err = register_filesystem(&ramfs_fs_type);
-	if (err)
-		bdi_destroy(&ramfs_backing_dev_info);
-
-	return err;
+	return register_filesystem(&ramfs_fs_type);
 }
 fs_initcall(init_ramfs_fs);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
 }
 
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+	struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+
+	if (!mtd)
+		return NOMMU_MAP_COPY;
+	return mtd_mmap_capabilities(mtd);
+}
+
 const struct file_operations romfs_ro_fops = {
 	.llseek			= generic_file_llseek,
 	.read			= new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
 	.splice_read		= generic_file_splice_read,
 	.mmap			= romfs_mmap,
 	.get_unmapped_area	= romfs_get_unmapped_area,
+	.mmap_capabilities	= romfs_mmap_capabilities,
 };
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..ed93dc6ae245 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2017,7 +2017,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
 	c->bdi.name = "ubifs",
-	c->bdi.capabilities = BDI_CAP_MAP_COPY;
+	c->bdi.capabilities = 0;
 	err  = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e936cea856dc..478f95d92d73 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -114,7 +114,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
+int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
@@ -228,42 +228,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_NO_ACCT_DIRTY:  Dirty pages shouldn't contribute to accounting
  * BDI_CAP_NO_WRITEBACK:   Don't write pages back
  * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
- *
- * These flags let !MMU mmap() govern direct device mapping vs immediate
- * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
- *
- * BDI_CAP_MAP_COPY:       Copy can be mapped (MAP_PRIVATE)
- * BDI_CAP_MAP_DIRECT:     Can be mapped directly (MAP_SHARED)
- * BDI_CAP_READ_MAP:       Can be mapped for reading
- * BDI_CAP_WRITE_MAP:      Can be mapped for writing
- * BDI_CAP_EXEC_MAP:       Can be mapped for execution
- *
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
-#define BDI_CAP_MAP_COPY	0x00000004
-#define BDI_CAP_MAP_DIRECT	0x00000008
-#define BDI_CAP_READ_MAP	0x00000010
-#define BDI_CAP_WRITE_MAP	0x00000020
-#define BDI_CAP_EXEC_MAP	0x00000040
-#define BDI_CAP_NO_ACCT_WB	0x00000080
-#define BDI_CAP_STABLE_WRITES	0x00000200
-#define BDI_CAP_STRICTLIMIT	0x00000400
-
-#define BDI_CAP_VMFLAGS \
-	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+#define BDI_CAP_NO_ACCT_WB	0x00000004
+#define BDI_CAP_STABLE_WRITES	0x00000008
+#define BDI_CAP_STRICTLIMIT	0x00000010
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
 	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
 
-#if defined(VM_MAYREAD) && \
-	(BDI_CAP_READ_MAP != VM_MAYREAD || \
-	 BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
-	 BDI_CAP_EXEC_MAP != VM_MAYEXEC)
-#error please change backing_dev_info::capabilities flags
-#endif
-
 extern struct backing_dev_info default_backing_dev_info;
 extern struct backing_dev_info noop_backing_dev_info;
 
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index fb4591977b03..f8763615a5f2 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -30,6 +30,4 @@ void cdev_del(struct cdev *);
 
 void cd_forget(struct inode *);
 
-extern struct backing_dev_info directly_mappable_cdev_bdi;
-
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42efe13077b6..1dada399aa23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1502,6 +1502,26 @@ struct block_device_operations;
 #define HAVE_COMPAT_IOCTL 1
 #define HAVE_UNLOCKED_IOCTL 1
 
+/*
+ * These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
+ *
+ * NOMMU_MAP_COPY:	Copy can be mapped (MAP_PRIVATE)
+ * NOMMU_MAP_DIRECT:	Can be mapped directly (MAP_SHARED)
+ * NOMMU_MAP_READ:	Can be mapped for reading
+ * NOMMU_MAP_WRITE:	Can be mapped for writing
+ * NOMMU_MAP_EXEC:	Can be mapped for execution
+ */
+#define NOMMU_MAP_COPY		0x00000001
+#define NOMMU_MAP_DIRECT	0x00000008
+#define NOMMU_MAP_READ		VM_MAYREAD
+#define NOMMU_MAP_WRITE		VM_MAYWRITE
+#define NOMMU_MAP_EXEC		VM_MAYEXEC
+
+#define NOMMU_VMFLAGS \
+	(NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
+
+
 struct iov_iter;
 
 struct file_operations {
@@ -1536,6 +1556,9 @@ struct file_operations {
 	long (*fallocate)(struct file *file, int mode, loff_t offset,
 			  loff_t len);
 	void (*show_fdinfo)(struct seq_file *m, struct file *f);
+#ifndef CONFIG_MMU
+	unsigned (*mmap_capabilities)(struct file *);
+#endif
 };
 
 struct inode_operations {
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 031ff3a9a0bd..3301c4c289d6 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -408,4 +408,6 @@ static inline int mtd_is_bitflip_or_eccerr(int err) {
 	return mtd_is_bitflip(err) || mtd_is_eccerr(err);
 }
 
+unsigned mtd_mmap_capabilities(struct mtd_info *mtd);
+
 #endif /* __MTD_MTD_H__ */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..16c68958aeda 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -17,8 +17,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 struct backing_dev_info default_backing_dev_info = {
 	.name		= "default",
 	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
-	.state		= 0,
-	.capabilities	= BDI_CAP_MAP_COPY,
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
@@ -513,13 +511,12 @@ EXPORT_SYMBOL(bdi_destroy);
  * For use from filesystems to quickly init and register a bdi associated
  * with dirty writeback
  */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
-			   unsigned int cap)
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
 {
 	int err;
 
 	bdi->name = name;
-	bdi->capabilities = cap;
+	bdi->capabilities = 0;
 	err = bdi_init(bdi);
 	if (err)
 		return err;
diff --git a/mm/nommu.c b/mm/nommu.c
index b51eadf6d952..13af96f35a4b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -946,9 +946,6 @@ static int validate_mmap_request(struct file *file,
 		return -EOVERFLOW;
 
 	if (file) {
-		/* validate file mapping requests */
-		struct address_space *mapping;
-
 		/* files must support mmap */
 		if (!file->f_op->mmap)
 			return -ENODEV;
@@ -957,28 +954,22 @@ static int validate_mmap_request(struct file *file,
 		 * - we support chardevs that provide their own "memory"
 		 * - we support files/blockdevs that are memory backed
 		 */
-		mapping = file->f_mapping;
-		if (!mapping)
-			mapping = file_inode(file)->i_mapping;
-
-		capabilities = 0;
-		if (mapping && mapping->backing_dev_info)
-			capabilities = mapping->backing_dev_info->capabilities;
-
-		if (!capabilities) {
+		if (file->f_op->mmap_capabilities) {
+			capabilities = file->f_op->mmap_capabilities(file);
+		} else {
 			/* no explicit capabilities set, so assume some
 			 * defaults */
 			switch (file_inode(file)->i_mode & S_IFMT) {
 			case S_IFREG:
 			case S_IFBLK:
-				capabilities = BDI_CAP_MAP_COPY;
+				capabilities = NOMMU_MAP_COPY;
 				break;
 
 			case S_IFCHR:
 				capabilities =
-					BDI_CAP_MAP_DIRECT |
-					BDI_CAP_READ_MAP |
-					BDI_CAP_WRITE_MAP;
+					NOMMU_MAP_DIRECT |
+					NOMMU_MAP_READ |
+					NOMMU_MAP_WRITE;
 				break;
 
 			default:
@@ -989,9 +980,9 @@ static int validate_mmap_request(struct file *file,
 		/* eliminate any capabilities that we can't support on this
 		 * device */
 		if (!file->f_op->get_unmapped_area)
-			capabilities &= ~BDI_CAP_MAP_DIRECT;
+			capabilities &= ~NOMMU_MAP_DIRECT;
 		if (!file->f_op->read)
-			capabilities &= ~BDI_CAP_MAP_COPY;
+			capabilities &= ~NOMMU_MAP_COPY;
 
 		/* The file shall have been opened with read permission. */
 		if (!(file->f_mode & FMODE_READ))
@@ -1010,29 +1001,29 @@ static int validate_mmap_request(struct file *file,
 			if (locks_verify_locked(file))
 				return -EAGAIN;
 
-			if (!(capabilities & BDI_CAP_MAP_DIRECT))
+			if (!(capabilities & NOMMU_MAP_DIRECT))
 				return -ENODEV;
 
 			/* we mustn't privatise shared mappings */
-			capabilities &= ~BDI_CAP_MAP_COPY;
+			capabilities &= ~NOMMU_MAP_COPY;
 		} else {
 			/* we're going to read the file into private memory we
 			 * allocate */
-			if (!(capabilities & BDI_CAP_MAP_COPY))
+			if (!(capabilities & NOMMU_MAP_COPY))
 				return -ENODEV;
 
 			/* we don't permit a private writable mapping to be
 			 * shared with the backing device */
 			if (prot & PROT_WRITE)
-				capabilities &= ~BDI_CAP_MAP_DIRECT;
+				capabilities &= ~NOMMU_MAP_DIRECT;
 		}
 
-		if (capabilities & BDI_CAP_MAP_DIRECT) {
-			if (((prot & PROT_READ)  && !(capabilities & BDI_CAP_READ_MAP))  ||
-			    ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
-			    ((prot & PROT_EXEC)  && !(capabilities & BDI_CAP_EXEC_MAP))
+		if (capabilities & NOMMU_MAP_DIRECT) {
+			if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
+			    ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+			    ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
 			    ) {
-				capabilities &= ~BDI_CAP_MAP_DIRECT;
+				capabilities &= ~NOMMU_MAP_DIRECT;
 				if (flags & MAP_SHARED) {
 					printk(KERN_WARNING
 					       "MAP_SHARED not completely supported on !MMU\n");
@@ -1049,21 +1040,21 @@ static int validate_mmap_request(struct file *file,
 		} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
 			/* handle implication of PROT_EXEC by PROT_READ */
 			if (current->personality & READ_IMPLIES_EXEC) {
-				if (capabilities & BDI_CAP_EXEC_MAP)
+				if (capabilities & NOMMU_MAP_EXEC)
 					prot |= PROT_EXEC;
 			}
 		} else if ((prot & PROT_READ) &&
 			 (prot & PROT_EXEC) &&
-			 !(capabilities & BDI_CAP_EXEC_MAP)
+			 !(capabilities & NOMMU_MAP_EXEC)
 			 ) {
 			/* backing file is not executable, try to copy */
-			capabilities &= ~BDI_CAP_MAP_DIRECT;
+			capabilities &= ~NOMMU_MAP_DIRECT;
 		}
 	} else {
 		/* anonymous mappings are always memory backed and can be
 		 * privately mapped
 		 */
-		capabilities = BDI_CAP_MAP_COPY;
+		capabilities = NOMMU_MAP_COPY;
 
 		/* handle PROT_EXEC implication by PROT_READ */
 		if ((prot & PROT_READ) &&
@@ -1095,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
 	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
 	/* vm_flags |= mm->def_flags; */
 
-	if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+	if (!(capabilities & NOMMU_MAP_DIRECT)) {
 		/* attempt to share read-only copies of mapped file chunks */
 		vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 		if (file && !(prot & PROT_WRITE))
@@ -1104,7 +1095,7 @@ static unsigned long determine_vm_flags(struct file *file,
 		/* overlay a shareable mapping on the backing device or inode
 		 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
 		 * romfs/cramfs */
-		vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
+		vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
 		if (flags & MAP_SHARED)
 			vm_flags |= VM_SHARED;
 	}
@@ -1157,7 +1148,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	 * shared mappings on devices or memory
 	 * - VM_MAYSHARE will be set if it may attempt to share
 	 */
-	if (capabilities & BDI_CAP_MAP_DIRECT) {
+	if (capabilities & NOMMU_MAP_DIRECT) {
 		ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
 		if (ret == 0) {
 			/* shouldn't return success if we're not sharing */
@@ -1346,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
 			    !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
 				/* new mapping is not a subset of the region */
-				if (!(capabilities & BDI_CAP_MAP_DIRECT))
+				if (!(capabilities & NOMMU_MAP_DIRECT))
 					goto sharing_violation;
 				continue;
 			}
@@ -1385,7 +1376,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 		 * - this is the hook for quasi-memory character devices to
 		 *   tell us the location of a shared mapping
 		 */
-		if (capabilities & BDI_CAP_MAP_DIRECT) {
+		if (capabilities & NOMMU_MAP_DIRECT) {
 			addr = file->f_op->get_unmapped_area(file, addr, len,
 							     pgoff, flags);
 			if (IS_ERR_VALUE(addr)) {
@@ -1397,10 +1388,10 @@ unsigned long do_mmap_pgoff(struct file *file,
 				 * the mapping so we'll have to attempt to copy
 				 * it */
 				ret = -ENODEV;
-				if (!(capabilities & BDI_CAP_MAP_COPY))
+				if (!(capabilities & NOMMU_MAP_COPY))
 					goto error_just_free;
 
-				capabilities &= ~BDI_CAP_MAP_DIRECT;
+				capabilities &= ~NOMMU_MAP_DIRECT;
 			} else {
 				vma->vm_start = region->vm_start = addr;
 				vma->vm_end = region->vm_end = addr + len;
@@ -1411,7 +1402,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vma->vm_region = region;
 
 	/* set up the mapping
-	 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+	 * - the region is filled in if NOMMU_MAP_DIRECT is still set
 	 */
 	if (file && vma->vm_flags & VM_SHARED)
 		ret = do_mmap_shared_file(vma);
diff --git a/security/security.c b/security/security.c
index 18b35c63fc0c..a0442b20f001 100644
--- a/security/security.c
+++ b/security/security.c
@@ -726,16 +726,15 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
 		return prot | PROT_EXEC;
 	/*
 	 * ditto if it's not on noexec mount, except that on !MMU we need
-	 * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case
+	 * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
 	 */
 	if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
 #ifndef CONFIG_MMU
-		unsigned long caps = 0;
-		struct address_space *mapping = file->f_mapping;
-		if (mapping && mapping->backing_dev_info)
-			caps = mapping->backing_dev_info->capabilities;
-		if (!(caps & BDI_CAP_EXEC_MAP))
-			return prot;
+		if (file->f_op->mmap_capabilities) {
+			unsigned caps = file->f_op->mmap_capabilities(file);
+			if (!(caps & NOMMU_MAP_EXEC))
+				return prot;
+		}
 #endif
 		return prot | PROT_EXEC;
 	}
-- 
cgit v1.2.3


From de1414a654e66b81b5348dbc5259ecf2fb61655e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Jan 2015 10:42:36 +0100
Subject: fs: export inode_to_bdi and use it in favor of
 mapping->backing_dev_info

Now that we got rid of the bdi abuse on character devices we can always use
sb->s_bdi to get at the backing_dev_info for a file, except for the block
device special case.  Export inode_to_bdi and replace uses of
mapping->backing_dev_info with it to prepare for the removal of
mapping->backing_dev_info.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/btrfs/file.c                  |  2 +-
 fs/ceph/file.c                   |  2 +-
 fs/ext2/ialloc.c                 |  2 +-
 fs/ext4/super.c                  |  2 +-
 fs/fs-writeback.c                |  3 ++-
 fs/fuse/file.c                   | 10 +++++-----
 fs/gfs2/aops.c                   |  2 +-
 fs/gfs2/super.c                  |  2 +-
 fs/nfs/filelayout/filelayout.c   |  2 +-
 fs/nfs/write.c                   |  6 +++---
 fs/ntfs/file.c                   |  3 ++-
 fs/ocfs2/file.c                  |  2 +-
 fs/xfs/xfs_file.c                |  2 +-
 include/linux/backing-dev.h      |  6 ++++--
 include/trace/events/writeback.h |  6 +++---
 mm/fadvise.c                     |  4 ++--
 mm/filemap.c                     |  4 ++--
 mm/filemap_xip.c                 |  3 ++-
 mm/page-writeback.c              | 29 +++++++++++++----------------
 mm/readahead.c                   |  4 ++--
 mm/truncate.c                    |  2 +-
 mm/vmscan.c                      |  4 ++--
 22 files changed, 52 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..835c04a874fd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 
 	mutex_lock(&inode->i_mutex);
 
-	current->backing_dev_info = inode->i_mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err) {
 		mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b394b49d..905986dd4c3c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	mutex_lock(&inode->i_mutex);
 
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = file->f_mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
 	struct ext2_group_desc * gdp;
 	struct backing_dev_info *bdi;
 
-	bdi = inode->i_mapping->backing_dev_info;
+	bdi = inode_to_bdi(inode);
 	if (bdi_read_congested(bdi))
 		return;
 	if (bdi_write_congested(bdi))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 74c5f53595fb..ad88e601a6cd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
 static int block_device_ejected(struct super_block *sb)
 {
 	struct inode *bd_inode = sb->s_bdev->bd_inode;
-	struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
 
 	return bdi->dev == NULL;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e8116a44cc29..a20b1145f4d5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,7 +66,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(writeback_in_progress);
 
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 #ifdef CONFIG_BLOCK
@@ -75,6 +75,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 #endif
 	return sb->s_bdi;
 }
+EXPORT_SYMBOL_GPL(inode_to_bdi);
 
 static inline struct inode *wb_inode(struct list_head *head)
 {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..19d80b82d344 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	mutex_lock(&inode->i_mutex);
 
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 {
 	struct inode *inode = req->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	int i;
 
 	list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
 	req->end = fuse_writepage_end;
 	req->inode = inode;
 
-	inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 
 	if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
 					old_req->state == FUSE_REQ_PENDING)) {
-		struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+		struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
 
 		copy_highpage(old_req->pages[0], page);
 		spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
 	req->page_descs[req->num_pages].offset = 0;
 	req->page_descs[req->num_pages].length = PAGE_SIZE;
 
-	inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
 		if (!clear_page_dirty_for_io(page))
 			goto continue_unlock;
 
-		trace_wbc_writepage(wbc, mapping->backing_dev_info);
+		trace_wbc_writepage(wbc, inode_to_bdi(inode));
 
 		ret = __gfs2_jdata_writepage(page, wbc);
 		if (unlikely(ret)) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
-	struct backing_dev_info *bdi = metamapping->backing_dev_info;
+	struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
 	int ret = 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..51aa889611cf 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1081,7 +1081,7 @@ mds_commit:
 	spin_unlock(cinfo->lock);
 	if (!cinfo->dreq) {
 		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 			     BDI_RECLAIMABLE);
 		__mark_inode_dirty(req->wb_context->dentry->d_inode,
 				   I_DIRTY_DATASYNC);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af3af685a9e3..298abcc5281b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -786,7 +786,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 	spin_unlock(cinfo->lock);
 	if (!cinfo->dreq) {
 		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 			     BDI_RECLAIMABLE);
 		__mark_inode_dirty(req->wb_context->dentry->d_inode,
 				   I_DIRTY_DATASYNC);
@@ -853,7 +853,7 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
 	dec_zone_page_state(page, NR_UNSTABLE_NFS);
-	dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
@@ -1564,7 +1564,7 @@ void nfs_retry_commit(struct list_head *page_list,
 		nfs_mark_request_commit(req, lseg, cinfo);
 		if (!cinfo->dreq) {
 			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-			dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+			dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 				     BDI_RECLAIMABLE);
 		}
 		nfs_unlock_and_release_request(req);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 	count = iov_length(iov, nr_segs);
 	pos = *ppos;
 	/* We can write back this queue in page reclaim. */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 	written = 0;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..abe7d98d6178 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2363,7 +2363,7 @@ relock:
 			goto out_dio;
 		}
 	} else {
-		current->backing_dev_info = file->f_mapping->backing_dev_info;
+		current->backing_dev_info = inode_to_bdi(inode);
 		written = generic_perform_write(file, from, *ppos);
 		if (likely(written >= 0))
 			iocb->ki_pos = *ppos + written;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 13e974e6a889..5684ac3e7d18 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -699,7 +699,7 @@ xfs_file_buffered_aio_write(
 
 	iov_iter_truncate(from, count);
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 478f95d92d73..ed59dee03a71 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -106,6 +106,8 @@ struct backing_dev_info {
 #endif
 };
 
+struct backing_dev_info *inode_to_bdi(struct inode *inode);
+
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
@@ -303,12 +305,12 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
 
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
-	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
+	return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host));
 }
 
 static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 {
-	return bdi_cap_account_dirty(mapping->backing_dev_info);
+	return bdi_cap_account_dirty(inode_to_bdi(mapping->host));
 }
 
 static inline int bdi_sched_wait(void *word)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d65ab3f..74f5207bd090 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -47,7 +47,7 @@ TRACE_EVENT(writeback_dirty_page,
 
 	TP_fast_assign(
 		strncpy(__entry->name,
-			mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32);
+			mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32);
 		__entry->ino = mapping ? mapping->host->i_ino : 0;
 		__entry->index = page->index;
 	),
@@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 	),
 
 	TP_fast_assign(
-		struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+		struct backing_dev_info *bdi = inode_to_bdi(inode);
 
 		/* may be called for files on pseudo FSes w/ unregistered bdi */
 		strncpy(__entry->name,
@@ -116,7 +116,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
 
 	TP_fast_assign(
 		strncpy(__entry->name,
-			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+			dev_name(inode_to_bdi(inode)->dev), 32);
 		__entry->ino		= inode->i_ino;
 		__entry->sync_mode	= wbc->sync_mode;
 	),
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..fac23ecf8d72 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	else
 		endbyte--;		/* inclusive */
 
-	bdi = mapping->backing_dev_info;
+	bdi = inode_to_bdi(mapping->host);
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
@@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	case POSIX_FADV_NOREUSE:
 		break;
 	case POSIX_FADV_DONTNEED:
-		if (!bdi_write_congested(mapping->backing_dev_info))
+		if (!bdi_write_congested(bdi))
 			__filemap_fdatawrite_range(mapping, offset, endbyte,
 						   WB_SYNC_NONE);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..5d7c23c26f81 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 	 */
 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
 	}
 }
 
@@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	size_t		count = iov_iter_count(from);
 
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
 		goto out;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0d105aeff82f..26897fbfbe19 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/backing-dev.h>
 #include <linux/pagemap.h>
 #include <linux/export.h>
 #include <linux/uio.h>
@@ -410,7 +411,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	count = len;
 
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 
 	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
 	if (ret)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f4335238e33..d4cbb4bd7d1c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long task_ratelimit;
 	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
 	unsigned long start_time = jiffies;
 
@@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
  */
 void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 	int ratelimit;
 	int *p;
 
@@ -1929,7 +1929,7 @@ continue_unlock:
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
-			trace_wbc_writepage(wbc, mapping->backing_dev_info);
+			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
 			ret = (*writepage)(page, wbc, data);
 			if (unlikely(ret)) {
 				if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 	trace_writeback_dirty_page(page, mapping);
 
 	if (mapping_cap_account_dirty(mapping)) {
+		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
-		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
-		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+		__inc_bdi_stat(bdi, BDI_RECLAIMABLE);
+		__inc_bdi_stat(bdi, BDI_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
@@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		current->nr_dirtied--;
 		dec_zone_page_state(page, NR_DIRTIED);
-		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2295,7 +2297,7 @@ int clear_page_dirty_for_io(struct page *page)
 		 */
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			dec_bdi_stat(mapping->backing_dev_info,
+			dec_bdi_stat(inode_to_bdi(mapping->host),
 					BDI_RECLAIMABLE);
 			return 1;
 		}
@@ -2315,7 +2317,7 @@ int test_clear_page_writeback(struct page *page)
 
 	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
+		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 		unsigned long flags;
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2352,7 +2354,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 
 	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
+		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 		unsigned long flags;
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2406,12 +2408,7 @@ EXPORT_SYMBOL(mapping_tagged);
  */
 void wait_for_stable_page(struct page *page)
 {
-	struct address_space *mapping = page_mapping(page);
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-
-	if (!bdi_cap_stable_pages_required(bdi))
-		return;
-
-	wait_on_page_writeback(page);
+	if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+		wait_on_page_writeback(page);
 }
 EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/readahead.c b/mm/readahead.c
index 17b9172ec37f..935675844b2e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -27,7 +27,7 @@
 void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 {
-	ra->ra_pages = mapping->backing_dev_info->ra_pages;
+	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
 	ra->prev_pos = -1;
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
 	/*
 	 * Defer asynchronous read-ahead on IO congestion.
 	 */
-	if (bdi_read_congested(mapping->backing_dev_info))
+	if (bdi_read_congested(inode_to_bdi(mapping->host)))
 		return;
 
 	/* do read-ahead */
diff --git a/mm/truncate.c b/mm/truncate.c
index f1e4d6052369..ddec5a5966d7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 		struct address_space *mapping = page->mapping;
 		if (mapping && mapping_cap_account_dirty(mapping)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			dec_bdi_stat(mapping->backing_dev_info,
+			dec_bdi_stat(inode_to_bdi(mapping->host),
 					BDI_RECLAIMABLE);
 			if (account_size)
 				task_io_account_cancelled_write(account_size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ab2505c3ef54..e00a16393f21 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -497,7 +497,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info, sc))
+	if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
 		return PAGE_KEEP;
 
 	if (clear_page_dirty_for_io(page)) {
@@ -876,7 +876,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		mapping = page_mapping(page);
 		if (((dirty || writeback) && mapping &&
-		     bdi_write_congested(mapping->backing_dev_info)) ||
+		     bdi_write_congested(inode_to_bdi(mapping->host))) ||
 		    (writeback && PageReclaim(page)))
 			nr_congested++;
 
-- 
cgit v1.2.3


From b83ae6d421435c6204150300f1c25bfbd39cd62b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Jan 2015 10:42:37 +0100
Subject: fs: remove mapping->backing_dev_info

Now that we never use the backing_dev_info pointer in struct address_space
we can simply remove it and save 4 to 8 bytes in every inode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/char/raw.c     |  4 +---
 fs/aio.c               |  1 -
 fs/block_dev.c         | 26 +-------------------------
 fs/btrfs/disk-io.c     |  1 -
 fs/btrfs/inode.c       |  6 ------
 fs/ceph/inode.c        |  2 --
 fs/cifs/inode.c        |  2 --
 fs/configfs/inode.c    |  1 -
 fs/ecryptfs/inode.c    |  1 -
 fs/exofs/inode.c       |  2 --
 fs/fuse/inode.c        |  1 -
 fs/gfs2/glock.c        |  1 -
 fs/gfs2/ops_fstype.c   |  1 -
 fs/hugetlbfs/inode.c   |  1 -
 fs/inode.c             | 13 -------------
 fs/kernfs/inode.c      |  1 -
 fs/ncpfs/inode.c       |  1 -
 fs/nfs/inode.c         |  1 -
 fs/nilfs2/gcinode.c    |  1 -
 fs/nilfs2/mdt.c        |  6 ++----
 fs/nilfs2/page.c       |  4 +---
 fs/nilfs2/page.h       |  3 +--
 fs/nilfs2/super.c      |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c |  2 --
 fs/ramfs/inode.c       |  1 -
 fs/romfs/super.c       |  3 ---
 fs/ubifs/dir.c         |  2 --
 fs/ubifs/super.c       |  3 ---
 include/linux/fs.h     |  3 +--
 mm/backing-dev.c       |  1 -
 mm/shmem.c             |  1 -
 mm/swap_state.c        |  1 -
 32 files changed, 8 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index a24891b97547..6e29bf2db536 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp)
 
 	mutex_lock(&raw_mutex);
 	bdev = raw_devices[minor].binding;
-	if (--raw_devices[minor].inuse == 0) {
+	if (--raw_devices[minor].inuse == 0)
 		/* Here  inode->i_mapping == bdev->bd_inode->i_mapping  */
 		inode->i_mapping = &inode->i_data;
-		inode->i_mapping->backing_dev_info = &default_backing_dev_info;
-	}
 	mutex_unlock(&raw_mutex);
 
 	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
diff --git a/fs/aio.c b/fs/aio.c
index 6f13d3fab07f..3bf8b1d250c3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -176,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-	inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 026ca7b8431c..a9f92794d7a0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -60,19 +60,6 @@ static void bdev_write_inode(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
-/*
- * Move the inode from its current bdi to a new bdi.  Make sure the inode
- * is clean before moving so that it doesn't linger on the old bdi.
- */
-static void bdev_inode_switch_bdi(struct inode *inode,
-			struct backing_dev_info *dst)
-{
-	spin_lock(&inode->i_lock);
-	WARN_ON_ONCE(inode->i_state & I_DIRTY);
-	inode->i_data.backing_dev_info = dst;
-	spin_unlock(&inode->i_lock);
-}
-
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
@@ -589,7 +576,6 @@ struct block_device *bdget(dev_t dev)
 		inode->i_bdev = bdev;
 		inode->i_data.a_ops = &def_blk_aops;
 		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		inode->i_data.backing_dev_info = &default_backing_dev_info;
 		spin_lock(&bdev_lock);
 		list_add(&bdev->bd_list, &all_bdevs);
 		spin_unlock(&bdev_lock);
@@ -1150,8 +1136,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (!partno) {
-			struct backing_dev_info *bdi;
-
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!bdev->bd_part)
@@ -1177,11 +1161,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				}
 			}
 
-			if (!ret) {
+			if (!ret)
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-				bdi = blk_get_backing_dev_info(bdev);
-				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
-			}
 
 			/*
 			 * If the device is invalidated, rescan partition
@@ -1208,8 +1189,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (ret)
 				goto out_clear;
 			bdev->bd_contains = whole;
-			bdev_inode_switch_bdi(bdev->bd_inode,
-				whole->bd_inode->i_data.backing_dev_info);
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
 			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1249,7 +1228,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_queue = NULL;
-	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
@@ -1474,8 +1452,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		 * dirty data before.
 		 */
 		bdev_write_inode(bdev->bd_inode);
-		bdev_inode_switch_bdi(bdev->bd_inode,
-					&default_backing_dev_info);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index afc4092989cd..1ec872e3a926 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2318,7 +2318,6 @@ int open_ctree(struct super_block *sb,
 	 */
 	fs_info->btree_inode->i_size = OFFSET_MAX;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
 	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bf326affb94..54bcf639d1cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3608,7 +3608,6 @@ cache_acl:
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3622,6 @@ cache_acl:
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		break;
 	default:
 		inode->i_op = &btrfs_special_inode_operations;
@@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	inode->i_fop = &btrfs_file_operations;
 	inode->i_op = &btrfs_file_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_aops;
-	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err)
@@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_fop = &btrfs_file_operations;
 	inode->i_op = &btrfs_file_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_aops;
-	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
-	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	inode_set_bytes(inode, name_len);
 	btrfs_i_size_write(inode, name_len);
 	err = btrfs_update_inode(trans, root, inode);
@@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_op = &btrfs_file_inode_operations;
 
 	inode->i_mapping->a_ops = &btrfs_aops;
-	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 
 	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f61a74115beb..6b5173605154 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	}
 
 	inode->i_mapping->a_ops = &ceph_aops;
-	inode->i_mapping->backing_dev_info =
-		&ceph_sb_to_client(inode->i_sb)->backing_dev_info;
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFIFO:
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
 		if (inode->i_state & I_NEW) {
 			inode->i_ino = hash;
-			if (S_ISREG(inode->i_mode))
-				inode->i_data.backing_dev_info = sb->s_bdi;
 #ifdef CONFIG_CIFS_FSCACHE
 			/* initialize per-inode cache cookie pointer */
 			CIFS_I(inode)->fscache = NULL;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 0ad6b4d6de00..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -131,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode->i_mapping->a_ops = &configfs_aops;
-		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
 
 		if (sd->s_iattr) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
 	inode->i_ino = lower_inode->i_ino;
 	inode->i_version++;
 	inode->i_mapping->a_ops = &ecryptfs_aops;
-	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	if (S_ISLNK(inode->i_mode))
 		inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..6fc91df99ff8 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
 
 	set_obj_2bcreated(oi);
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f38256e4476e..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 		if (!fc->writeback_cache || !S_ISREG(attr->mode))
 			inode->i_flags |= S_NOCMTIME;
 		inode->i_generation = generation;
-		inode->i_data.backing_dev_info = &fc->bdi;
 		fuse_init_inode(inode, attr);
 		unlock_new_inode(inode);
 	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..08ea717981f7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -775,7 +775,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		mapping->flags = 0;
 		mapping_set_gfp_mask(mapping, GFP_NOFS);
 		mapping->private_data = NULL;
-		mapping->backing_dev_info = s->s_bdi;
 		mapping->writeback_index = 0;
 	}
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	mapping->private_data = NULL;
-	mapping->backing_dev_info = sb->s_bdi;
 	mapping->writeback_index = 0;
 
 	spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index de7c95c7d840..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -492,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
-		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->private_data = resv_map;
 		info = HUGETLBFS_I(inode);
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..e4e8caa7464c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	atomic_set(&mapping->i_mmap_writable, 0);
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->private_data = NULL;
-	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
-
-	/*
-	 * If the block_device provides a backing_dev_info for client
-	 * inodes then use that.  Otherwise the inode share the bdev's
-	 * backing_dev_info.
-	 */
-	if (sb->s_bdev) {
-		struct backing_dev_info *bdi;
-
-		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-		mapping->backing_dev_info = bdi;
-	}
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
 	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 06f06887b2d2..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -286,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
 	kernfs_get(kn);
 	inode->i_private = kn;
 	inode->i_mapping->a_ops = &kernfs_aops;
-	inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 	inode->i_op = &kernfs_iops;
 
 	set_default_inode_attr(inode, kn->mode);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index a699a3fc62c0..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	if (inode) {
 		atomic_set(&NCP_FINFO(inode)->opened, info->opened);
 
-		inode->i_mapping->backing_dev_info = sb->s_bdi;
 		inode->i_ino = info->ino;
 		ncp_set_attr(inode, info);
 		if (S_ISREG(inode->i_mode)) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4bffe637ea32..24aac72420f4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -387,7 +387,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		if (S_ISREG(inode->i_mode)) {
 			inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
 			inode->i_data.a_ops = &nfs_file_aops;
-			inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
 		} else if (S_ISDIR(inode->i_mode)) {
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
 	inode->i_mode = S_IFREG;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
 	inode->i_mapping->a_ops = &empty_aops;
-	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	ii->i_flags = 0;
 	nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 
 	inode->i_mode = S_IFREG;
 	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
-	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	inode->i_op = &def_mdt_iops;
 	inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 			       struct nilfs_shadow_map *shadow)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
-	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
 	address_space_init_once(&shadow->frozen_data);
-	nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
+	nilfs_mapping_init(&shadow->frozen_data, inode);
 	address_space_init_once(&shadow->frozen_btnodes);
-	nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
+	nilfs_mapping_init(&shadow->frozen_btnodes, inode);
 	mi->mi_shadow = shadow;
 	return 0;
 }
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	return nc;
 }
 
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
-			struct backing_dev_info *bdi)
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
 {
 	mapping->host = inode;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	mapping->private_data = NULL;
-	mapping->backing_dev_info = bdi;
 	mapping->a_ops = &empty_aops;
 }
 
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_page(struct page *, bool);
 void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
-			struct backing_dev_info *bdi);
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 					    sector_t start_blk,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3d4bbac36bea..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
 	ii->i_state = 0;
 	ii->i_cno = 0;
 	ii->vfs_inode.i_version = 1;
-	nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
+	nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
 	return &ii->vfs_inode;
 }
 
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 6000d3029b26..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -398,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, NULL, mode);
-		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inc_nlink(inode);
 
@@ -422,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 	inode->i_ino = get_next_ino();
 	inode_init_owner(inode, parent, mode);
-	inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	ip = DLMFS_I(inode);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index ad4d712002f4..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -59,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
-		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
 	case ROMFH_REG:
 		i->i_fop = &romfs_ro_fops;
 		i->i_data.a_ops = &romfs_aops;
-		if (i->i_sb->s_mtd)
-			i->i_data.backing_dev_info =
-				i->i_sb->s_mtd->backing_dev_info;
 		if (nextfh & ROMFH_EXEC)
 			mode |= S_IXUGO;
 		break;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..c49b1981ac95 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 ubifs_current_time(inode);
 	inode->i_mapping->nrpages = 0;
-	/* Disable readahead */
-	inode->i_mapping->backing_dev_info = &c->bdi;
 
 	switch (mode & S_IFMT) {
 	case S_IFREG:
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ed93dc6ae245..6197154f36ca 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 	if (err)
 		goto out_invalid;
 
-	/* Disable read-ahead */
-	inode->i_mapping->backing_dev_info = &c->bdi;
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &ubifs_file_address_operations;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1dada399aa23..65d02de342e1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -34,6 +34,7 @@
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
 
+struct backing_dev_info;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
@@ -394,7 +395,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 
-struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
@@ -409,7 +409,6 @@ struct address_space {
 	pgoff_t			writeback_index;/* writeback starts here */
 	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
-	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	void			*private_data;	/* ditto */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 16c68958aeda..52e0c7652448 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,7 +24,6 @@ struct backing_dev_info noop_backing_dev_info = {
 	.name		= "noop",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
-EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 
 static struct class *bdi_class;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 1b77eaf589fd..4c61d3d5bfb4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1410,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
-		inode->i_mapping->backing_dev_info = &noop_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_generation = get_seconds();
 		info = SHMEM_I(inode);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1c137b69ecde..405923f77334 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,7 +37,6 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = {
 		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
 		.i_mmap_writable = ATOMIC_INIT(0),
 		.a_ops		= &swap_aops,
-		.backing_dev_info = &noop_backing_dev_info,
 	}
 };
 
-- 
cgit v1.2.3


From df0ce26cb4ee8bc233d50213b97213532aff0a3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Jan 2015 10:42:41 +0100
Subject: fs: remove default_backing_dev_info

Now that default_backing_dev_info is not used for writeback purposes we can
git rid of it easily:

 - instead of using it's name for tracing unregistered bdi we just use
   "unknown"
 - btrfs and ceph can just assign the default read ahead window themselves
   like several other filesystems already do.
 - we can assign noop_backing_dev_info as the default one in alloc_super.
   All filesystems already either assigned their own or
   noop_backing_dev_info.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/btrfs/disk-io.c               | 2 +-
 fs/ceph/super.c                  | 2 +-
 fs/super.c                       | 8 ++------
 include/linux/backing-dev.h      | 1 -
 include/trace/events/writeback.h | 6 ++----
 mm/backing-dev.c                 | 9 ---------
 6 files changed, 6 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ec872e3a926..1afb18226da8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1719,7 +1719,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 	if (err)
 		return err;
 
-	bdi->ra_pages	= default_backing_dev_info.ra_pages;
+	bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
 	bdi->congested_fn	= btrfs_congested_fn;
 	bdi->congested_data	= info;
 	return 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e350cc1611e4..5ae62587a71d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -899,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb,
 			>> PAGE_SHIFT;
 	else
 		fsc->backing_dev_info.ra_pages =
-			default_backing_dev_info.ra_pages;
+			VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
 
 	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
 			   atomic_long_inc_return(&bdi_seq));
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..3b4dadafdd60 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -185,8 +185,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	}
 	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	s->s_bdi = &default_backing_dev_info;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
@@ -863,10 +863,7 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	int error = get_anon_bdev(&s->s_dev);
-	if (!error)
-		s->s_bdi = &noop_backing_dev_info;
-	return error;
+	return get_anon_bdev(&s->s_dev);
 }
 
 EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1108,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	sb = root->d_sb;
 	BUG_ON(!sb);
 	WARN_ON(!sb->s_bdi);
-	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index ed59dee03a71..d94077fea1f8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -241,7 +241,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
 	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
 
-extern struct backing_dev_info default_backing_dev_info;
 extern struct backing_dev_info noop_backing_dev_info;
 
 int writeback_in_progress(struct backing_dev_info *bdi);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 74f5207bd090..0e9310905413 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -156,10 +156,8 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__field(int, reason)
 	),
 	TP_fast_assign(
-		struct device *dev = bdi->dev;
-		if (!dev)
-			dev = default_backing_dev_info.dev;
-		strncpy(__entry->name, dev_name(dev), 32);
+		strncpy(__entry->name,
+			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
 		__entry->nr_pages = work->nr_pages;
 		__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
 		__entry->sync_mode = work->sync_mode;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1725adb242e0..7690ec77c722 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,12 +14,6 @@
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-struct backing_dev_info default_backing_dev_info = {
-	.name		= "default",
-	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
-};
-EXPORT_SYMBOL_GPL(default_backing_dev_info);
-
 struct backing_dev_info noop_backing_dev_info = {
 	.name		= "noop",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -250,9 +244,6 @@ static int __init default_bdi_init(void)
 	if (!bdi_wq)
 		return -ENOMEM;
 
-	err = bdi_init(&default_backing_dev_info);
-	if (!err)
-		bdi_register(&default_backing_dev_info, NULL, "default");
 	err = bdi_init(&noop_backing_dev_info);
 
 	return err;
-- 
cgit v1.2.3


From d93ba7a5a97c9f315bacdcdb8de4e5f368e7b396 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 20 Jan 2015 20:06:30 -0500
Subject: block: Add discard flag to blkdev_issue_zeroout() function

blkdev_issue_discard() will zero a given block range. This is done by
way of explicit writing, thus provisioning or allocating the blocks on
disk.

There are use cases where the desired behavior is to zero the blocks but
unprovision them if possible. The blocks must deterministically contain
zeroes when they are subsequently read back.

This patch adds a flag to blkdev_issue_zeroout() that provides this
variant. If the discard flag is set and a block device guarantees
discard_zeroes_data we will use REQ_DISCARD to clear the block range. If
the device does not support discard_zeroes_data or if the discard
request fails we will fall back to first REQ_WRITE_SAME and then a
regular REQ_WRITE.

Also update the callers of blkdev_issue_zero() to reflect the new flag
and make sb_issue_zeroout() prefer the discard approach.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-lib.c                    | 30 ++++++++++++++++++++++++++----
 block/ioctl.c                      |  2 +-
 drivers/block/drbd/drbd_receiver.c |  2 +-
 include/linux/blkdev.h             |  4 ++--
 4 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 8411be3c19d3..715e948f58a4 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -283,23 +283,45 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @discard:	whether to discard the block range
  *
  * Description:
- *  Generate and issue number of bios with zerofiled pages.
+
+ *  Zero-fill a block range.  If the discard flag is set and the block
+ *  device guarantees that subsequent READ operations to the block range
+ *  in question will return zeroes, the blocks will be discarded. Should
+ *  the discard request fail, if the discard flag is not set, or if
+ *  discard_zeroes_data is not supported, this function will resort to
+ *  zeroing the blocks manually, thus provisioning (allocating,
+ *  anchoring) them. If the block device supports the WRITE SAME command
+ *  blkdev_issue_zeroout() will use it to optimize the process of
+ *  clearing the block range. Otherwise the zeroing will be performed
+ *  using regular WRITE calls.
  */
 
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			 sector_t nr_sects, gfp_t gfp_mask)
+			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
 {
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned char bdn[BDEVNAME_SIZE];
+
+	if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data) {
+
+		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0))
+			return 0;
+
+		bdevname(bdev, bdn);
+		pr_warn("%s: DISCARD failed. Manually zeroing.\n", bdn);
+	}
+
 	if (bdev_write_same(bdev)) {
-		unsigned char bdn[BDEVNAME_SIZE];
 
 		if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
 					     ZERO_PAGE(0)))
 			return 0;
 
 		bdevname(bdev, bdn);
-		pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
+		pr_warn("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
 	}
 
 	return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
diff --git a/block/ioctl.c b/block/ioctl.c
index 6c7bf903742f..7d8befde2aca 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -198,7 +198,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
 	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
+	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d169b4a79267..cee20354ac37 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
 		list_add_tail(&peer_req->w.list, &device->active_ee);
 		spin_unlock_irq(&device->resource->req_lock);
 		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
-			sector, data_size >> 9, GFP_NOIO))
+			sector, data_size >> 9, GFP_NOIO, false))
 			peer_req->flags |= EE_WAS_ERROR;
 		drbd_endio_write_sec_final(peer_req);
 		return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e9086be6d9a0..4c4b732d7556 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1162,7 +1162,7 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask);
+		sector_t nr_sects, gfp_t gfp_mask, bool discard);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1176,7 +1176,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 	return blkdev_issue_zeroout(sb->s_bdev,
 				    block << (sb->s_blocksize_bits - 9),
 				    nr_blocks << (sb->s_blocksize_bits - 9),
-				    gfp_mask);
+				    gfp_mask, true);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
cgit v1.2.3


From 55422d0bd292f5ad143cc32cb8bb8505257274c4 Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Thu, 22 Jan 2015 00:00:23 -0500
Subject: audit: replace getname()/putname() hacks with reference counters

In order to ensure that filenames are not released before the audit
subsystem is done with the strings there are a number of hacks built
into the fs and audit subsystems around getname() and putname().  To
say these hacks are "ugly" would be kind.

This patch removes the filename hackery in favor of a more
conventional reference count based approach.  The diffstat below tells
most of the story; lots of audit/fs specific code is replaced with a
traditional reference count based approach that is easily understood,
even by those not familiar with the audit and/or fs subsystems.

CC: viro@zeniv.linux.org.uk
CC: linux-fsdevel@vger.kernel.org
Signed-off-by: Paul Moore <pmoore@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            |  29 +++++++-------
 include/linux/audit.h |   3 --
 include/linux/fs.h    |   9 +----
 kernel/audit.h        |  17 +-------
 kernel/auditsc.c      | 105 ++++++--------------------------------------------
 5 files changed, 29 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index a3fde77d4abf..96ca11dea4a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -118,15 +118,6 @@
  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
  * PATH_MAX includes the nul terminator --RR.
  */
-void final_putname(struct filename *name)
-{
-	if (name->separate) {
-		__putname(name->name);
-		kfree(name);
-	} else {
-		__putname(name);
-	}
-}
 
 #define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))
 
@@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
 	result = __getname();
 	if (unlikely(!result))
 		return ERR_PTR(-ENOMEM);
+	result->refcnt = 1;
 
 	/*
 	 * First, try to embed the struct filename inside the names_cache
@@ -179,6 +171,7 @@ recopy:
 		}
 		result->name = kname;
 		result->separate = true;
+		result->refcnt = 1;
 		max = PATH_MAX;
 		goto recopy;
 	}
@@ -202,7 +195,7 @@ recopy:
 	return result;
 
 error:
-	final_putname(result);
+	putname(result);
 	return err;
 }
 
@@ -243,19 +236,25 @@ getname_kernel(const char * filename)
 	memcpy((char *)result->name, filename, len);
 	result->uptr = NULL;
 	result->aname = NULL;
+	result->refcnt = 1;
 	audit_getname(result);
 
 	return result;
 }
 
-#ifdef CONFIG_AUDITSYSCALL
 void putname(struct filename *name)
 {
-	if (unlikely(!audit_dummy_context()))
-		return audit_putname(name);
-	final_putname(name);
+	BUG_ON(name->refcnt <= 0);
+
+	if (--name->refcnt > 0)
+		return;
+
+	if (name->separate) {
+		__putname(name->name);
+		kfree(name);
+	} else
+		__putname(name);
 }
-#endif
 
 static int check_acl(struct inode *inode, int mask)
 {
diff --git a/include/linux/audit.h b/include/linux/audit.h
index af84234e1f6e..87c2d347d255 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -128,7 +128,6 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
 extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern struct filename *__audit_reusename(const __user char *uptr);
 extern void __audit_getname(struct filename *name);
-extern void audit_putname(struct filename *name);
 
 #define AUDIT_INODE_PARENT	1	/* dentry represents the parent */
 #define AUDIT_INODE_HIDDEN	2	/* audit record should be hidden */
@@ -353,8 +352,6 @@ static inline struct filename *audit_reusename(const __user char *name)
 }
 static inline void audit_getname(struct filename *name)
 { }
-static inline void audit_putname(struct filename *name)
-{ }
 static inline void __audit_inode(struct filename *name,
 					const struct dentry *dentry,
 					unsigned int flags)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f90c0282c114..b49842fe203f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2080,6 +2080,7 @@ struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
 	struct audit_names	*aname;
+	int			refcnt;
 	bool			separate; /* should "name" be freed? */
 };
 
@@ -2101,6 +2102,7 @@ extern int filp_close(struct file *, fl_owner_t id);
 extern struct filename *getname_flags(const char __user *, int, int *);
 extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);
+extern void putname(struct filename *name);
 
 enum {
 	FILE_CREATED = 1,
@@ -2121,15 +2123,8 @@ extern void __init vfs_caches_init(unsigned long);
 
 extern struct kmem_cache *names_cachep;
 
-extern void final_putname(struct filename *name);
-
 #define __getname()		kmem_cache_alloc(names_cachep, GFP_KERNEL)
 #define __putname(name)		kmem_cache_free(names_cachep, (void *)(name))
-#ifndef CONFIG_AUDITSYSCALL
-#define putname(name)		final_putname(name)
-#else
-extern void putname(struct filename *name);
-#endif
 
 #ifdef CONFIG_BLOCK
 extern int register_blkdev(unsigned int, const char *);
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
 
-/* 0 = no checking
-   1 = put_count checking
-   2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname().  If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
 	};
 };
 
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device.
  */
@@ -86,7 +79,6 @@ struct audit_names {
 	struct filename		*name;
 	int			name_len;	/* number of chars to log */
 	bool			hidden;		/* don't log this record */
-	bool			name_put;	/* call __putname()? */
 
 	unsigned long		ino;
 	dev_t			dev;
@@ -208,11 +200,6 @@ struct audit_context {
 	};
 	int fds[2];
 	struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
-	int		    put_count;
-	int		    ino_count;
-#endif
 };
 
 extern u32 audit_ever_enabled;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4f521964ccaa..0c38604a630c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
 {
 	struct audit_names *n, *next;
 
-#if AUDIT_DEBUG == 2
-	if (context->put_count + context->ino_count != context->name_count) {
-		int i = 0;
-
-		pr_err("%s:%d(:%d): major=%d in_syscall=%d"
-		       " name_count=%d put_count=%d ino_count=%d"
-		       " [NOT freeing]\n", __FILE__, __LINE__,
-		       context->serial, context->major, context->in_syscall,
-		       context->name_count, context->put_count,
-		       context->ino_count);
-		list_for_each_entry(n, &context->names_list, list) {
-			pr_err("names[%d] = %p = %s\n", i++, n->name,
-			       n->name->name ?: "(null)");
-		}
-		dump_stack();
-		return;
-	}
-#endif
-#if AUDIT_DEBUG
-	context->put_count  = 0;
-	context->ino_count  = 0;
-#endif
-
 	list_for_each_entry_safe(n, next, &context->names_list, list) {
 		list_del(&n->list);
-		if (n->name && n->name_put)
-			final_putname(n->name);
+		if (n->name)
+			putname(n->name);
 		if (n->should_free)
 			kfree(n);
 	}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
 	list_add_tail(&aname->list, &context->names_list);
 
 	context->name_count++;
-#if AUDIT_DEBUG
-	context->ino_count++;
-#endif
 	return aname;
 }
 
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
 	list_for_each_entry(n, &context->names_list, list) {
 		if (!n->name)
 			continue;
-		if (n->name->uptr == uptr)
+		if (n->name->uptr == uptr) {
+			n->name->refcnt++;
 			return n->name;
+		}
 	}
 	return NULL;
 }
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
 	struct audit_context *context = current->audit_context;
 	struct audit_names *n;
 
-	if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): ignoring getname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		dump_stack();
-#endif
+	if (!context->in_syscall)
 		return;
-	}
-
-#if AUDIT_DEBUG
-	/* The filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
 
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
-	n->name_put = true;
 	name->aname = n;
+	name->refcnt++;
 
 	if (!context->pwd.dentry)
 		get_fs_pwd(current->fs, &context->pwd);
 }
 
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
-	struct audit_context *context = current->audit_context;
-
-	BUG_ON(!context);
-	if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): final_putname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		if (context->name_count) {
-			struct audit_names *n;
-			int i = 0;
-
-			list_for_each_entry(n, &context->names_list, list)
-				pr_err("name[%d] = %p = %s\n", i++, n->name,
-				       n->name->name ?: "(null)");
-			}
-#endif
-		final_putname(name);
-	}
-#if AUDIT_DEBUG
-	else {
-		++context->put_count;
-		if (context->put_count > context->name_count) {
-			pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
-			       " name_count=%d put_count=%d\n",
-			       __FILE__, __LINE__,
-			       context->serial, context->major,
-			       context->in_syscall, name->name,
-			       context->name_count, context->put_count);
-			dump_stack();
-		}
-	}
-#endif
-}
-
 /**
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1842,11 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	if (!name)
 		goto out_alloc;
 
-#if AUDIT_DEBUG
-	/* The struct filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
-
 	/*
 	 * If we have a pointer to an audit_names entry already, then we can
 	 * just use it directly if the type is correct.
@@ -1893,9 +1810,10 @@ out_alloc:
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
 		return;
-	if (name)
-		/* no need to set ->name_put as the original will cleanup */
+	if (name) {
 		n->name = name;
+		name->refcnt++;
+	}
 
 out:
 	if (parent) {
@@ -2000,8 +1918,7 @@ void __audit_inode_child(const struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			/* don't call __putname() */
-			found_child->name_put = false;
+			found_child->name->refcnt++;
 		}
 	}
 
-- 
cgit v1.2.3


From 536e1715226c94037df12f7c6280cbe0f6009f92 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 5 Nov 2014 11:43:26 +0100
Subject: gpu: host1x: Call ->remove() only when a device is bound

When a driver's ->probe() function fails, the host1x bus must not call
its ->remove() function because the driver will already have cleaned up
in the error handling path in ->probe().

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 9 +++++++--
 include/linux/host1x.h   | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index aaf54859adb0..e4182e68e29c 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -116,7 +116,10 @@ static void host1x_subdev_register(struct host1x_device *device,
 	if (list_empty(&device->subdevs)) {
 		err = device->driver->probe(device);
 		if (err < 0)
-			dev_err(&device->dev, "probe failed: %d\n", err);
+			dev_err(&device->dev, "probe failed for %ps: %d\n",
+				device->driver, err);
+		else
+			device->bound = true;
 	}
 }
 
@@ -130,10 +133,12 @@ static void __host1x_subdev_unregister(struct host1x_device *device,
 	 * If all subdevices have been activated, we're about to remove the
 	 * first active subdevice, so unload the driver first.
 	 */
-	if (list_empty(&device->subdevs)) {
+	if (list_empty(&device->subdevs) && device->bound) {
 		err = device->driver->remove(device);
 		if (err < 0)
 			dev_err(&device->dev, "remove failed: %d\n", err);
+
+		device->bound = false;
 	}
 
 	/*
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index bb9840fd1e18..7890b553d12e 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -272,6 +272,8 @@ struct host1x_device {
 
 	struct mutex clients_lock;
 	struct list_head clients;
+
+	bool bound;
 };
 
 static inline struct host1x_device *to_host1x_device(struct device *dev)
-- 
cgit v1.2.3


From 31928aa5863e71535ee942f506ca9ac8ce1c4315 Mon Sep 17 00:00:00 2001
From: Dominik Dingel <dingel@linux.vnet.ibm.com>
Date: Thu, 4 Dec 2014 15:47:07 +0100
Subject: KVM: remove unneeded return value of vcpu_postcreate

The return value of kvm_arch_vcpu_postcreate is not checked in its
caller.  This is okay, because only x86 provides vcpu_postcreate right
now and it could only fail if vcpu_load failed.  But that is not
possible during KVM_CREATE_VCPU (kvm_arch_vcpu_load is void, too), so
just get rid of the unchecked return value.

Signed-off-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/arm/kvm/arm.c         |  3 +--
 arch/mips/kvm/mips.c       |  3 +--
 arch/powerpc/kvm/powerpc.c |  3 +--
 arch/s390/kvm/kvm-s390.c   |  3 +--
 arch/x86/kvm/x86.c         | 10 +++-------
 include/linux/kvm_host.h   |  2 +-
 6 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 2d6d91001062..1a10e0ce9266 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -241,9 +241,8 @@ out:
 	return ERR_PTR(err);
 }
 
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-	return 0;
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index e3b21e51ff7e..7082481cd108 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -832,9 +832,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return -ENOIOCTLCMD;
 }
 
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c45eaab752b0..27c0face86f4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -623,9 +623,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	return vcpu;
 }
 
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-	return 0;
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3e09801e3104..ec004f80ee45 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -615,9 +615,8 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
 	kvm_s390_clear_local_irqs(vcpu);
 }
 
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-	return 0;
 }
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 49ecda7ca958..274fbcbcc180 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7056,15 +7056,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-	int r;
 	struct msr_data msr;
 	struct kvm *kvm = vcpu->kvm;
 
-	r = vcpu_load(vcpu);
-	if (r)
-		return r;
+	if (vcpu_load(vcpu))
+		return;
 	msr.data = 0x0;
 	msr.index = MSR_IA32_TSC;
 	msr.host_initiated = true;
@@ -7073,8 +7071,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 					KVMCLOCK_SYNC_PERIOD);
-
-	return r;
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 26f106022c88..a82432c710c9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -652,7 +652,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 int kvm_arch_hardware_enable(void);
-- 
cgit v1.2.3


From 3c5199143bc4b35f472c5c2534026d74821e2044 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Thu, 22 Jan 2015 08:19:32 -0500
Subject: sunrpc/lockd: fix references to the BKL

The BKL is completely out of the picture in the lockd and sunrpc code
these days. Update the antiquated comments that refer to it.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svclock.c         | 4 ++--
 include/linux/sunrpc/svc.h | 2 +-
 net/sunrpc/svc.c           | 4 ++--
 net/sunrpc/svc_xprt.c      | 3 +--
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
 static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
 {
 	/*
-	 * We can get away with a static buffer because we're only
-	 * called with BKL held.
+	 * We can get away with a static buffer because this is only called
+	 * from lockd, which is single-threaded.
 	 */
 	static char buf[2*NLM_MAXCOOKIELEN+1];
 	unsigned int i, len = sizeof(buf);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 6f22cfeef5e3..fae6fb947fc8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -110,7 +110,7 @@ struct svc_serv {
  * We use sv_nrthreads as a reference count.  svc_destroy() drops
  * this refcount, so we need to bump it up around operations that
  * change the number of threads.  Horrible, but there it is.
- * Should be called with the BKL held.
+ * Should be called with the "service mutex" held.
  */
 static inline void svc_get(struct svc_serv *serv)
 {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 91eaef1844c8..78974e4d9ad2 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 EXPORT_SYMBOL_GPL(svc_set_num_threads);
 
 /*
- * Called from a server thread as it's exiting. Caller must hold the BKL or
- * the "service mutex", whichever is appropriate for the service.
+ * Called from a server thread as it's exiting. Caller must hold the "service
+ * mutex" for the service.
  */
 void
 svc_exit_thread(struct svc_rqst *rqstp)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c69358b3cf7f..163ac45c3639 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list);
  *	svc_pool->sp_lock protects most of the fields of that pool.
  *	svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
  *	when both need to be taken (rare), svc_serv->sv_lock is first.
- *	BKL protects svc_serv->sv_nrthread.
+ *	The "service mutex" protects svc_serv->sv_nrthread.
  *	svc_sock->sk_lock protects the svc_sock->sk_deferred list
  *             and the ->sk_info_authunix cache.
  *
@@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list);
  *		  that no other thread will be using the transport or will
  *		  try to set XPT_DEAD.
  */
-
 int svc_reg_xprt_class(struct svc_xprt_class *xcl)
 {
 	struct svc_xprt_class *cl;
-- 
cgit v1.2.3


From ee1b6f7aff94019c09e73837054979063f722046 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 15 Jan 2015 17:32:25 -0800
Subject: block: support different tag allocation policy

The libata tag allocation is using a round-robin policy. Next patch will
make libata use block generic tag allocation, so let's add a policy to
tag allocation.

Currently two policies: FIFO (default) and round-robin.

Cc: Jens Axboe <axboe@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-tag.c          | 33 +++++++++++++++++++++++++--------
 drivers/block/osdblk.c   |  2 +-
 drivers/scsi/scsi_scan.c |  3 ++-
 include/linux/blkdev.h   |  8 ++++++--
 include/scsi/scsi_host.h |  3 +++
 include/scsi/scsi_tcq.h  |  3 ++-
 6 files changed, 39 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-tag.c b/block/blk-tag.c
index a185b86741e5..f0344e6939d5 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -119,7 +119,7 @@ fail:
 }
 
 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
-						   int depth)
+						int depth, int alloc_policy)
 {
 	struct blk_queue_tag *tags;
 
@@ -131,6 +131,8 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
 		goto fail;
 
 	atomic_set(&tags->refcnt, 1);
+	tags->alloc_policy = alloc_policy;
+	tags->next_tag = 0;
 	return tags;
 fail:
 	kfree(tags);
@@ -140,10 +142,11 @@ fail:
 /**
  * blk_init_tags - initialize the tag info for an external tag map
  * @depth:	the maximum queue depth supported
+ * @alloc_policy: tag allocation policy
  **/
-struct blk_queue_tag *blk_init_tags(int depth)
+struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy)
 {
-	return __blk_queue_init_tags(NULL, depth);
+	return __blk_queue_init_tags(NULL, depth, alloc_policy);
 }
 EXPORT_SYMBOL(blk_init_tags);
 
@@ -152,19 +155,20 @@ EXPORT_SYMBOL(blk_init_tags);
  * @q:  the request queue for the device
  * @depth:  the maximum queue depth supported
  * @tags: the tag to use
+ * @alloc_policy: tag allocation policy
  *
  * Queue lock must be held here if the function is called to resize an
  * existing map.
  **/
 int blk_queue_init_tags(struct request_queue *q, int depth,
-			struct blk_queue_tag *tags)
+			struct blk_queue_tag *tags, int alloc_policy)
 {
 	int rc;
 
 	BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
 
 	if (!tags && !q->queue_tags) {
-		tags = __blk_queue_init_tags(q, depth);
+		tags = __blk_queue_init_tags(q, depth, alloc_policy);
 
 		if (!tags)
 			return -ENOMEM;
@@ -344,9 +348,21 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	}
 
 	do {
-		tag = find_first_zero_bit(bqt->tag_map, max_depth);
-		if (tag >= max_depth)
-			return 1;
+		if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) {
+			tag = find_first_zero_bit(bqt->tag_map, max_depth);
+			if (tag >= max_depth)
+				return 1;
+		} else {
+			int start = bqt->next_tag;
+			int size = min_t(int, bqt->max_depth, max_depth + start);
+			tag = find_next_zero_bit(bqt->tag_map, size, start);
+			if (tag >= size && start + size > bqt->max_depth) {
+				size = start + size - bqt->max_depth;
+				tag = find_first_zero_bit(bqt->tag_map, size);
+			}
+			if (tag >= size)
+				return 1;
+		}
 
 	} while (test_and_set_bit_lock(tag, bqt->tag_map));
 	/*
@@ -354,6 +370,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	 * See blk_queue_end_tag for details.
 	 */
 
+	bqt->next_tag = (tag + 1) % bqt->max_depth;
 	rq->cmd_flags |= REQ_QUEUED;
 	rq->tag = tag;
 	bqt->tag_index[tag] = rq;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 79aa179305b5..e22942596207 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
 	}
 
 	/* switch queue to TCQ mode; allocate tag map */
-	rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+	rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
 	if (rc) {
 		blk_cleanup_queue(q);
 		put_disk(disk);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 983aed10ff2f..921a8c897eb2 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -290,7 +290,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	if (!shost_use_blk_mq(sdev->host) &&
 	    (shost->bqt || shost->hostt->use_blk_tags)) {
 		blk_queue_init_tags(sdev->request_queue,
-				    sdev->host->cmd_per_lun, shost->bqt);
+				    sdev->host->cmd_per_lun, shost->bqt,
+				    shost->hostt->tag_alloc_policy);
 	}
 	scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4c4b732d7556..6f388fd1c11c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -272,7 +272,11 @@ struct blk_queue_tag {
 	int max_depth;			/* what we will send to device */
 	int real_max_depth;		/* what the array can hold */
 	atomic_t refcnt;		/* map can be shared */
+	int alloc_policy;		/* tag allocation policy */
+	int next_tag;			/* next tag */
 };
+#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
+#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
 
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
@@ -1139,11 +1143,11 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 extern int blk_queue_start_tag(struct request_queue *, struct request *);
 extern struct request *blk_queue_find_tag(struct request_queue *, int);
 extern void blk_queue_end_tag(struct request_queue *, struct request *);
-extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
+extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int);
 extern void blk_queue_free_tags(struct request_queue *);
 extern int blk_queue_resize_tags(struct request_queue *, int);
 extern void blk_queue_invalidate_tags(struct request_queue *);
-extern struct blk_queue_tag *blk_init_tags(int);
+extern struct blk_queue_tag *blk_init_tags(int, int);
 extern void blk_free_tags(struct blk_queue_tag *);
 
 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 019e66858ce6..e113c757d555 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -402,6 +402,9 @@ struct scsi_host_template {
 	 */
 	unsigned char present;
 
+	/* If use block layer to manage tags, this is tag allocation policy */
+	int tag_alloc_policy;
+
 	/*
 	 * Let the block layer assigns tags to all commands.
 	 */
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index 9708b28bd2aa..b27977e8aaed 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -66,7 +66,8 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth)
 	 * devices on the shared host (for libata)
 	 */
 	if (!shost->bqt) {
-		shost->bqt = blk_init_tags(depth);
+		shost->bqt = blk_init_tags(depth,
+			shost->hostt->tag_alloc_policy);
 		if (!shost->bqt)
 			return -ENOMEM;
 	}
-- 
cgit v1.2.3


From 24391c0dc57c3756a219defaa781e68637d6ab7d Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 23 Jan 2015 14:18:00 -0700
Subject: blk-mq: add tag allocation policy

This is the blk-mq part to support tag allocation policy. The default
allocation policy isn't changed (though it's not a strict FIFO). The new
policy is round-robin for libata. But it's a try-best implementation. If
multiple tasks are competing, the tags returned will be mixed (which is
unavoidable even with !mq, as requests from different tasks can be
mixed in queue)

Cc: Jens Axboe <axboe@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c      | 39 ++++++++++++++++++++++++---------------
 block/blk-mq-tag.h      |  4 +++-
 block/blk-mq.c          |  3 ++-
 drivers/scsi/scsi_lib.c |  2 ++
 include/linux/blk-mq.h  |  8 ++++++++
 5 files changed, 39 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d4daee385a23..e3387a74a9a2 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -140,7 +140,8 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	return atomic_read(&hctx->nr_active) < depth;
 }
 
-static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
+			 bool nowrap)
 {
 	int tag, org_last_tag = last_tag;
 
@@ -152,7 +153,7 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
 			 * offset to 0 in a failure case, so start from 0 to
 			 * exhaust the map.
 			 */
-			if (org_last_tag && last_tag) {
+			if (org_last_tag && last_tag && !nowrap) {
 				last_tag = org_last_tag = 0;
 				continue;
 			}
@@ -170,6 +171,8 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
 	return tag;
 }
 
+#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
+
 /*
  * Straight forward bitmap tag implementation, where each bit is a tag
  * (cleared == free, and set == busy). The small twist is using per-cpu
@@ -182,7 +185,7 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
  * until the map is exhausted.
  */
 static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
-		    unsigned int *tag_cache)
+		    unsigned int *tag_cache, struct blk_mq_tags *tags)
 {
 	unsigned int last_tag, org_last_tag;
 	int index, i, tag;
@@ -194,7 +197,8 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
 	index = TAG_TO_INDEX(bt, last_tag);
 
 	for (i = 0; i < bt->map_nr; i++) {
-		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
+		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
+				    BT_ALLOC_RR(tags));
 		if (tag != -1) {
 			tag += (index << bt->bits_per_word);
 			goto done;
@@ -221,7 +225,7 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
 	 * up using the specific cached tag.
 	 */
 done:
-	if (tag == org_last_tag) {
+	if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
 		last_tag = tag + 1;
 		if (last_tag >= bt->depth - 1)
 			last_tag = 0;
@@ -250,13 +254,13 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
 static int bt_get(struct blk_mq_alloc_data *data,
 		struct blk_mq_bitmap_tags *bt,
 		struct blk_mq_hw_ctx *hctx,
-		unsigned int *last_tag)
+		unsigned int *last_tag, struct blk_mq_tags *tags)
 {
 	struct bt_wait_state *bs;
 	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = __bt_get(hctx, bt, last_tag);
+	tag = __bt_get(hctx, bt, last_tag, tags);
 	if (tag != -1)
 		return tag;
 
@@ -267,7 +271,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
 	do {
 		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(hctx, bt, last_tag);
+		tag = __bt_get(hctx, bt, last_tag, tags);
 		if (tag != -1)
 			break;
 
@@ -282,7 +286,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __bt_get(hctx, bt, last_tag);
+		tag = __bt_get(hctx, bt, last_tag, tags);
 		if (tag != -1)
 			break;
 
@@ -313,7 +317,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	int tag;
 
 	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
-			&data->ctx->last_tag);
+			&data->ctx->last_tag, data->hctx->tags);
 	if (tag >= 0)
 		return tag + data->hctx->tags->nr_reserved_tags;
 
@@ -329,7 +333,8 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
 		return BLK_MQ_TAG_FAIL;
 	}
 
-	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero);
+	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
+		data->hctx->tags);
 	if (tag < 0)
 		return BLK_MQ_TAG_FAIL;
 
@@ -401,7 +406,8 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 
 		BUG_ON(real_tag >= tags->nr_tags);
 		bt_clear_tag(&tags->bitmap_tags, real_tag);
-		*last_tag = real_tag;
+		if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
+			*last_tag = real_tag;
 	} else {
 		BUG_ON(tag >= tags->nr_reserved_tags);
 		bt_clear_tag(&tags->breserved_tags, tag);
@@ -538,10 +544,12 @@ static void bt_free(struct blk_mq_bitmap_tags *bt)
 }
 
 static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
-						   int node)
+						   int node, int alloc_policy)
 {
 	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
 
+	tags->alloc_policy = alloc_policy;
+
 	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
 		goto enomem;
 	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
@@ -555,7 +563,8 @@ enomem:
 }
 
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
-				     unsigned int reserved_tags, int node)
+				     unsigned int reserved_tags,
+				     int node, int alloc_policy)
 {
 	struct blk_mq_tags *tags;
 
@@ -571,7 +580,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
-	return blk_mq_init_bitmap_tags(tags, node);
+	return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
 }
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index a6fa0fc9d41a..90767b370308 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -42,10 +42,12 @@ struct blk_mq_tags {
 
 	struct request **rqs;
 	struct list_head page_list;
+
+	int alloc_policy;
 };
 
 
-extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
+extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7d4a988516f..eb8e694fda06 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1374,7 +1374,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 	size_t rq_size, left;
 
 	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
-				set->numa_node);
+				set->numa_node,
+				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9ea95dd3e260..49ab11508286 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2188,6 +2188,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 	shost->tag_set.cmd_size = cmd_size;
 	shost->tag_set.numa_node = NUMA_NO_NODE;
 	shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	shost->tag_set.flags |=
+		BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
 	shost->tag_set.driver_data = shost;
 
 	return blk_mq_alloc_tag_set(&shost->tag_set);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5b6500c77ed2..86b08b1a5eba 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -147,6 +147,8 @@ enum {
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_SYSFS_UP	= 1 << 3,
 	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
+	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
+	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
@@ -155,6 +157,12 @@ enum {
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
 };
+#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
+	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
+		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
+#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
+	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
+		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 void blk_mq_finish_init(struct request_queue *q);
-- 
cgit v1.2.3


From 12cb5ce101abfaf74421f8cc9f196e708209eb79 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 15 Jan 2015 17:32:27 -0800
Subject: libata: use blk taging

libata uses its own tag management which is duplication and the
implementation is poor. And if we switch to blk-mq, tag is build-in.
It's time to switch to generic taging.

The SAS driver has its own tag management, and looks we can't directly
map the host controler tag to SATA tag. So I just bypassed the SAS case.

I changed the code/variable name for the tag management of libata to
make it self contained. Only sas will use it. Later if libsas implements
its tag management, the tag management code in libata can be deleted
easily.

Cc: Jens Axboe <axboe@fb.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/ata/libata-core.c | 54 +++++++++++++++++++++++++++++++++++------------
 drivers/ata/libata-scsi.c |  4 +++-
 drivers/ata/libata.h      |  2 +-
 include/linux/libata.h    |  5 +++--
 4 files changed, 48 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 5c84fb5c3372..695d33df3df5 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1525,6 +1525,15 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc)
 	complete(waiting);
 }
 
+static bool ata_valid_internal_tag(struct ata_port *ap, struct ata_device *dev,
+				   unsigned int tag)
+{
+	if (!ap->scsi_host)
+		return !test_and_set_bit(tag, &ap->sas_tag_allocated);
+	return !dev->sdev ||
+	       !blk_queue_find_tag(dev->sdev->request_queue, tag);
+}
+
 /**
  *	ata_exec_internal_sg - execute libata internal command
  *	@dev: Device to which the command is sent
@@ -1585,8 +1594,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
 	else
 		tag = 0;
 
-	if (test_and_set_bit(tag, &ap->qc_allocated))
-		BUG();
+	BUG_ON(!ata_valid_internal_tag(ap, dev, tag));
 	qc = __ata_qc_from_tag(ap, tag);
 
 	qc->tag = tag;
@@ -4737,27 +4745,23 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
  *	None.
  */
 
-static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
+static struct ata_queued_cmd *sas_ata_qc_new(struct ata_port *ap)
 {
 	struct ata_queued_cmd *qc = NULL;
 	unsigned int max_queue = ap->host->n_tags;
 	unsigned int i, tag;
 
-	/* no command while frozen */
-	if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
-		return NULL;
-
-	for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) {
+	for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) {
 		tag = tag < max_queue ? tag : 0;
 
 		/* the last tag is reserved for internal command. */
 		if (tag == ATA_TAG_INTERNAL)
 			continue;
 
-		if (!test_and_set_bit(tag, &ap->qc_allocated)) {
+		if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) {
 			qc = __ata_qc_from_tag(ap, tag);
 			qc->tag = tag;
-			ap->last_tag = tag;
+			ap->sas_last_tag = tag;
 			break;
 		}
 	}
@@ -4765,6 +4769,24 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 	return qc;
 }
 
+static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap, int blktag)
+{
+	struct ata_queued_cmd *qc;
+
+	/* no command while frozen */
+	if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
+		return NULL;
+
+	/* SATA will directly use block tag. libsas need its own tag management */
+	if (ap->scsi_host) {
+		qc = __ata_qc_from_tag(ap, blktag);
+		qc->tag = blktag;
+		return qc;
+	}
+
+	return sas_ata_qc_new(ap);
+}
+
 /**
  *	ata_qc_new_init - Request an available ATA command, and initialize it
  *	@dev: Device from whom we request an available command structure
@@ -4773,12 +4795,12 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
  *	None.
  */
 
-struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
+struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag)
 {
 	struct ata_port *ap = dev->link->ap;
 	struct ata_queued_cmd *qc;
 
-	qc = ata_qc_new(ap);
+	qc = ata_qc_new(ap, blktag);
 	if (qc) {
 		qc->scsicmd = NULL;
 		qc->ap = ap;
@@ -4800,6 +4822,12 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
  *	LOCKING:
  *	spin_lock_irqsave(host lock)
  */
+static void sas_ata_qc_free(unsigned int tag, struct ata_port *ap)
+{
+	if (!ap->scsi_host)
+		clear_bit(tag, &ap->sas_tag_allocated);
+}
+
 void ata_qc_free(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap;
@@ -4812,7 +4840,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
 	tag = qc->tag;
 	if (likely(ata_tag_valid(tag))) {
 		qc->tag = ATA_TAG_POISON;
-		clear_bit(tag, &ap->qc_allocated);
+		sas_ata_qc_free(tag, ap);
 	}
 }
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index e364e86e84d7..94339c2aed1b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
 {
 	struct ata_queued_cmd *qc;
 
-	qc = ata_qc_new_init(dev);
+	qc = ata_qc_new_init(dev, cmd->request->tag);
 	if (qc) {
 		qc->scsicmd = cmd;
 		qc->scsidone = cmd->scsi_done;
@@ -3666,6 +3666,8 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
 		 */
 		shost->max_host_blocked = 1;
 
+		scsi_init_shared_tag_map(shost, host->n_tags);
+
 		rc = scsi_add_host_with_dma(ap->scsi_host,
 						&ap->tdev, ap->host->dev);
 		if (rc)
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 5f4e0cca56ec..40405135bbb6 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
 extern void ata_force_cbl(struct ata_port *ap);
 extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
 extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
-extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
+extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag);
 extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
 			   u64 block, u32 n_block, unsigned int tf_flags,
 			   unsigned int tag);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2d182413b1db..f23454762717 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -821,10 +821,10 @@ struct ata_port {
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
 
 	struct ata_queued_cmd	qcmd[ATA_MAX_QUEUE];
-	unsigned long		qc_allocated;
+	unsigned long		sas_tag_allocated; /* for sas tag allocation only */
 	unsigned int		qc_active;
 	int			nr_active_links; /* #links with active qcs */
-	unsigned int		last_tag;	/* track next tag hw expects */
+	unsigned int		sas_last_tag;	/* track next tag hw expects */
 
 	struct ata_link		link;		/* host default link */
 	struct ata_link		*slave_link;	/* see ata_slave_link_init() */
@@ -1344,6 +1344,7 @@ extern struct device_attribute *ata_common_sdev_attrs[];
 	.ioctl			= ata_scsi_ioctl,		\
 	.queuecommand		= ata_scsi_queuecmd,		\
 	.can_queue		= ATA_DEF_QUEUE,		\
+	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
 	.this_id		= ATA_SHT_THIS_ID,		\
 	.cmd_per_lun		= ATA_SHT_CMD_PER_LUN,		\
 	.emulated		= ATA_SHT_EMULATED,		\
-- 
cgit v1.2.3


From c99e76c55f68eaa0c307ba25803c4e59c2fca1ca Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann@caviumnetworks.com>
Date: Mon, 12 Jan 2015 16:05:52 +0100
Subject: USB: host: Introduce flag to enable use of 64-bit dma_mask for
 ehci-platform

ehci-octeon driver used a 64-bit dma_mask. With removal of ehci-octeon
and usage of ehci-platform ehci dma_mask is now limited to 32 bits
(coerced in ehci_platform_probe).

Provide a flag in ehci platform data to allow use of 64 bits for
dma_mask.

Cc: David Daney <david.daney@cavium.com>
Cc: Alex Smith <alex.smith@imgtec.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann@caviumnetworks.com>
Tested-by: Aaro Koskinen <aaro.koskinen@iki.fi>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/mips/cavium-octeon/octeon-platform.c | 4 +---
 drivers/usb/host/ehci-platform.c          | 3 ++-
 include/linux/usb/ehci_pdriver.h          | 1 +
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/cavium-octeon/octeon-platform.c b/arch/mips/cavium-octeon/octeon-platform.c
index eea60b6c927b..12410a2788d8 100644
--- a/arch/mips/cavium-octeon/octeon-platform.c
+++ b/arch/mips/cavium-octeon/octeon-platform.c
@@ -310,6 +310,7 @@ static struct usb_ehci_pdata octeon_ehci_pdata = {
 #ifdef __BIG_ENDIAN
 	.big_endian_mmio	= 1,
 #endif
+	.dma_mask_64	= 1,
 	.power_on	= octeon_ehci_power_on,
 	.power_off	= octeon_ehci_power_off,
 };
@@ -331,8 +332,6 @@ static void __init octeon_ehci_hw_start(struct device *dev)
 	octeon2_usb_clocks_stop();
 }
 
-static u64 octeon_ehci_dma_mask = DMA_BIT_MASK(64);
-
 static int __init octeon_ehci_device_init(void)
 {
 	struct platform_device *pd;
@@ -347,7 +346,6 @@ static int __init octeon_ehci_device_init(void)
 	if (!pd)
 		return 0;
 
-	pd->dev.dma_mask = &octeon_ehci_dma_mask;
 	pd->dev.platform_data = &octeon_ehci_pdata;
 	octeon_ehci_hw_start(&pd->dev);
 
diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c
index 28aae64a8bee..63f2622926c4 100644
--- a/drivers/usb/host/ehci-platform.c
+++ b/drivers/usb/host/ehci-platform.c
@@ -155,7 +155,8 @@ static int ehci_platform_probe(struct platform_device *dev)
 	if (!pdata)
 		pdata = &ehci_platform_defaults;
 
-	err = dma_coerce_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32));
+	err = dma_coerce_mask_and_coherent(&dev->dev,
+		pdata->dma_mask_64 ? DMA_BIT_MASK(64) : DMA_BIT_MASK(32));
 	if (err)
 		return err;
 
diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h
index 6287b398abd9..db0431b39a63 100644
--- a/include/linux/usb/ehci_pdriver.h
+++ b/include/linux/usb/ehci_pdriver.h
@@ -48,6 +48,7 @@ struct usb_ehci_pdata {
 	unsigned	big_endian_mmio:1;
 	unsigned	no_io_watchdog:1;
 	unsigned	reset_on_resume:1;
+	unsigned	dma_mask_64:1;
 
 	/* Turn on all power and clocks */
 	int (*power_on)(struct platform_device *pdev);
-- 
cgit v1.2.3


From 9636c37843c9355c1ab6adcd8491186cbdc3b950 Mon Sep 17 00:00:00 2001
From: Chris Rorvick <chris@rorvick.com>
Date: Wed, 14 Jan 2015 21:52:28 -0600
Subject: usb: Fix typo in `struct usb_host_interface' comment

The descriptor member `bNumEndpoints' is plural.

Signed-off-by: Chris Rorvick <chris@rorvick.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index f89c24a03bd9..4add5661080a 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -82,7 +82,7 @@ struct usb_host_interface {
 	int extralen;
 	unsigned char *extra;   /* Extra descriptors */
 
-	/* array of desc.bNumEndpoint endpoints associated with this
+	/* array of desc.bNumEndpoints endpoints associated with this
 	 * interface setting.  these will be in no particular order.
 	 */
 	struct usb_host_endpoint *endpoint;
-- 
cgit v1.2.3


From 524134d422316a59d5464ccbc12036bbe90c5563 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 21 Jan 2015 14:02:43 -0500
Subject: USB: don't cancel queued resets when unbinding drivers

The USB stack provides a mechanism for drivers to request an
asynchronous device reset (usb_queue_reset_device()).  The mechanism
uses a work item (reset_ws) embedded in the usb_interface structure
used by the driver, and the reset is carried out by a work queue
routine.

The asynchronous reset can race with driver unbinding.  When this
happens, we try to cancel the queued reset before unbinding the
driver, on the theory that the driver won't care about any resets once
it is unbound.

However, thanks to the fact that lockdep now tracks work queue
accesses, this can provoke a lockdep warning in situations where the
device reset causes another interface's driver to be unbound; see

	http://marc.info/?l=linux-usb&m=141893165203776&w=2

for an example.  The reason is that the work routine for reset_ws in
one interface calls cancel_queued_work() for the reset_ws in another
interface.  Lockdep thinks this might lead to a work routine trying to
cancel itself.  The simplest solution is not to cancel queued resets
when unbinding drivers.

This means we now need to acquire a reference to the usb_interface
when queuing a reset_ws work item and to drop the reference when the
work routine finishes.  We also need to make sure that the
usb_interface structure doesn't outlive its parent usb_device; this
means acquiring and dropping a reference when the interface is created
and destroyed.

In addition, cancelling a queued reset can fail (if the device is in
the middle of an earlier reset), and this can cause usb_reset_device()
to try to rebind an interface that has been deallocated (see
http://marc.info/?l=linux-usb&m=142175717016628&w=2 for details).
Acquiring the extra references prevents this failure.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Reported-by: Russell King - ARM Linux <linux@arm.linux.org.uk>
Reported-by: Olivier Sobrie <olivier@sobrie.be>
Tested-by: Olivier Sobrie <olivier@sobrie.be>
Cc: stable <stable@vger.kernel.org> # 3.19
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/driver.c  | 17 -----------------
 drivers/usb/core/hub.c     | 25 +++++++++----------------
 drivers/usb/core/message.c | 23 +++--------------------
 include/linux/usb.h        |  5 -----
 4 files changed, 12 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 874dec31a111..c76ec9758ce3 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -275,21 +275,6 @@ static int usb_unbind_device(struct device *dev)
 	return 0;
 }
 
-/*
- * Cancel any pending scheduled resets
- *
- * [see usb_queue_reset_device()]
- *
- * Called after unconfiguring / when releasing interfaces. See
- * comments in __usb_queue_reset_device() regarding
- * udev->reset_running.
- */
-static void usb_cancel_queued_reset(struct usb_interface *iface)
-{
-	if (iface->reset_running == 0)
-		cancel_work_sync(&iface->reset_ws);
-}
-
 /* called from driver core with dev locked */
 static int usb_probe_interface(struct device *dev)
 {
@@ -380,7 +365,6 @@ static int usb_probe_interface(struct device *dev)
 	usb_set_intfdata(intf, NULL);
 	intf->needs_remote_wakeup = 0;
 	intf->condition = USB_INTERFACE_UNBOUND;
-	usb_cancel_queued_reset(intf);
 
 	/* If the LPM disable succeeded, balance the ref counts. */
 	if (!lpm_disable_error)
@@ -425,7 +409,6 @@ static int usb_unbind_interface(struct device *dev)
 		usb_disable_interface(udev, intf, false);
 
 	driver->disconnect(intf);
-	usb_cancel_queued_reset(intf);
 
 	/* Free streams */
 	for (i = 0, j = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) {
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index aeb50bb6ba9c..b4bfa3ac4b12 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -5589,26 +5589,19 @@ EXPORT_SYMBOL_GPL(usb_reset_device);
  *   possible; depending on how the driver attached to each interface
  *   handles ->pre_reset(), the second reset might happen or not.
  *
- * - If a driver is unbound and it had a pending reset, the reset will
- *   be cancelled.
+ * - If the reset is delayed so long that the interface is unbound from
+ *   its driver, the reset will be skipped.
  *
- * - This function can be called during .probe() or .disconnect()
- *   times. On return from .disconnect(), any pending resets will be
- *   cancelled.
- *
- * There is no no need to lock/unlock the @reset_ws as schedule_work()
- * does its own.
- *
- * NOTE: We don't do any reference count tracking because it is not
- *     needed. The lifecycle of the work_struct is tied to the
- *     usb_interface. Before destroying the interface we cancel the
- *     work_struct, so the fact that work_struct is queued and or
- *     running means the interface (and thus, the device) exist and
- *     are referenced.
+ * - This function can be called during .probe().  It can also be called
+ *   during .disconnect(), but doing so is pointless because the reset
+ *   will not occur.  If you really want to reset the device during
+ *   .disconnect(), call usb_reset_device() directly -- but watch out
+ *   for nested unbinding issues!
  */
 void usb_queue_reset_device(struct usb_interface *iface)
 {
-	schedule_work(&iface->reset_ws);
+	if (schedule_work(&iface->reset_ws))
+		usb_get_intf(iface);
 }
 EXPORT_SYMBOL_GPL(usb_queue_reset_device);
 
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index f7b7713cfb2a..f368d2053da5 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1551,6 +1551,7 @@ static void usb_release_interface(struct device *dev)
 			altsetting_to_usb_interface_cache(intf->altsetting);
 
 	kref_put(&intfc->ref, usb_release_interface_cache);
+	usb_put_dev(interface_to_usbdev(intf));
 	kfree(intf);
 }
 
@@ -1626,24 +1627,6 @@ static struct usb_interface_assoc_descriptor *find_iad(struct usb_device *dev,
 
 /*
  * Internal function to queue a device reset
- *
- * This is initialized into the workstruct in 'struct
- * usb_device->reset_ws' that is launched by
- * message.c:usb_set_configuration() when initializing each 'struct
- * usb_interface'.
- *
- * It is safe to get the USB device without reference counts because
- * the life cycle of @iface is bound to the life cycle of @udev. Then,
- * this function will be ran only if @iface is alive (and before
- * freeing it any scheduled instances of it will have been cancelled).
- *
- * We need to set a flag (usb_dev->reset_running) because when we call
- * the reset, the interfaces might be unbound. The current interface
- * cannot try to remove the queued work as it would cause a deadlock
- * (you cannot remove your work from within your executing
- * workqueue). This flag lets it know, so that
- * usb_cancel_queued_reset() doesn't try to do it.
- *
  * See usb_queue_reset_device() for more details
  */
 static void __usb_queue_reset_device(struct work_struct *ws)
@@ -1655,11 +1638,10 @@ static void __usb_queue_reset_device(struct work_struct *ws)
 
 	rc = usb_lock_device_for_reset(udev, iface);
 	if (rc >= 0) {
-		iface->reset_running = 1;
 		usb_reset_device(udev);
-		iface->reset_running = 0;
 		usb_unlock_device(udev);
 	}
+	usb_put_intf(iface);	/* Undo _get_ in usb_queue_reset_device() */
 }
 
 
@@ -1854,6 +1836,7 @@ free_interfaces:
 		dev_set_name(&intf->dev, "%d-%s:%d.%d",
 			dev->bus->busnum, dev->devpath,
 			configuration, alt->desc.bInterfaceNumber);
+		usb_get_dev(dev);
 	}
 	kfree(new_interfaces);
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 4add5661080a..7ee1b5c3b4cb 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -127,10 +127,6 @@ enum usb_interface_condition {
  *	to the sysfs representation for that device.
  * @pm_usage_cnt: PM usage counter for this interface
  * @reset_ws: Used for scheduling resets from atomic context.
- * @reset_running: set to 1 if the interface is currently running a
- *      queued reset so that usb_cancel_queued_reset() doesn't try to
- *      remove from the workqueue when running inside the worker
- *      thread. See __usb_queue_reset_device().
  * @resetting_device: USB core reset the device, so use alt setting 0 as
  *	current; needs bandwidth alloc after reset.
  *
@@ -181,7 +177,6 @@ struct usb_interface {
 	unsigned needs_remote_wakeup:1;	/* driver requires remote wakeup */
 	unsigned needs_altsetting0:1;	/* switch to altsetting 0 is pending */
 	unsigned needs_binding:1;	/* needs delayed unbind/rebind */
-	unsigned reset_running:1;
 	unsigned resetting_device:1;	/* true: bandwidth alloc after reset */
 
 	struct device dev;		/* interface specific device info */
-- 
cgit v1.2.3


From 79af73079d753b2d04e46f7445716d3b5f914dbd Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Wed, 21 Jan 2015 10:54:10 -0500
Subject: Add security hooks to binder and implement the hooks for SELinux.

Add security hooks to the binder and implement the hooks for SELinux.
The security hooks enable security modules such as SELinux to implement
controls over binder IPC.  The security hooks include support for
controlling what process can become the binder context manager
(binder_set_context_mgr), controlling the ability of a process
to invoke a binder transaction/IPC to another process (binder_transaction),
controlling the ability of a process to transfer a binder reference to
another process (binder_transfer_binder), and controlling the ability
of a process to transfer an open file to another process (binder_transfer_file).

These hooks have been included in the Android kernel trees since Android 4.3.

(Updated to reflect upstream relocation and changes to the binder driver,
changes to the LSM audit data structures, coding style cleanups, and
to add inline documentation for the hooks).

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Nick Kralevich <nnk@google.com>
Acked-by: Jeffrey Vander Stoep <jeffv@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 26 +++++++++++++
 include/linux/security.h            | 58 +++++++++++++++++++++++++++++
 security/capability.c               | 27 ++++++++++++++
 security/security.c                 | 23 ++++++++++++
 security/selinux/hooks.c            | 73 +++++++++++++++++++++++++++++++++++++
 security/selinux/include/classmap.h |  2 +
 6 files changed, 209 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 8c43521d3f11..33b09b6568a4 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -37,6 +37,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
+#include <linux/security.h>
 
 #ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
 #define BINDER_IPC_32BIT 1
@@ -1400,6 +1401,11 @@ static void binder_transaction(struct binder_proc *proc,
 			return_error = BR_DEAD_REPLY;
 			goto err_dead_binder;
 		}
+		if (security_binder_transaction(proc->tsk,
+						target_proc->tsk) < 0) {
+			return_error = BR_FAILED_REPLY;
+			goto err_invalid_target_handle;
+		}
 		if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
 			struct binder_transaction *tmp;
 
@@ -1551,6 +1557,11 @@ static void binder_transaction(struct binder_proc *proc,
 				return_error = BR_FAILED_REPLY;
 				goto err_binder_get_ref_for_node_failed;
 			}
+			if (security_binder_transfer_binder(proc->tsk,
+							    target_proc->tsk)) {
+				return_error = BR_FAILED_REPLY;
+				goto err_binder_get_ref_for_node_failed;
+			}
 			ref = binder_get_ref_for_node(target_proc, node);
 			if (ref == NULL) {
 				return_error = BR_FAILED_REPLY;
@@ -1581,6 +1592,11 @@ static void binder_transaction(struct binder_proc *proc,
 				return_error = BR_FAILED_REPLY;
 				goto err_binder_get_ref_failed;
 			}
+			if (security_binder_transfer_binder(proc->tsk,
+							    target_proc->tsk)) {
+				return_error = BR_FAILED_REPLY;
+				goto err_binder_get_ref_failed;
+			}
 			if (ref->node->proc == target_proc) {
 				if (fp->type == BINDER_TYPE_HANDLE)
 					fp->type = BINDER_TYPE_BINDER;
@@ -1638,6 +1654,13 @@ static void binder_transaction(struct binder_proc *proc,
 				return_error = BR_FAILED_REPLY;
 				goto err_fget_failed;
 			}
+			if (security_binder_transfer_file(proc->tsk,
+							  target_proc->tsk,
+							  file) < 0) {
+				fput(file);
+				return_error = BR_FAILED_REPLY;
+				goto err_get_unused_fd_failed;
+			}
 			target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
 			if (target_fd < 0) {
 				fput(file);
@@ -2675,6 +2698,9 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
 		ret = -EBUSY;
 		goto out;
 	}
+	ret = security_binder_set_context_mgr(proc->tsk);
+	if (ret < 0)
+		goto out;
 	if (uid_valid(binder_context_mgr_uid)) {
 		if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
 			pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
diff --git a/include/linux/security.h b/include/linux/security.h
index ba96471c11ba..a1b7dbd127ff 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1281,6 +1281,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@alter contains the flag indicating whether changes are to be made.
  *	Return 0 if permission is granted.
  *
+ * @binder_set_context_mgr
+ *	Check whether @mgr is allowed to be the binder context manager.
+ *	@mgr contains the task_struct for the task being registered.
+ *	Return 0 if permission is granted.
+ * @binder_transaction
+ *	Check whether @from is allowed to invoke a binder transaction call
+ *	to @to.
+ *	@from contains the task_struct for the sending task.
+ *	@to contains the task_struct for the receiving task.
+ * @binder_transfer_binder
+ *	Check whether @from is allowed to transfer a binder reference to @to.
+ *	@from contains the task_struct for the sending task.
+ *	@to contains the task_struct for the receiving task.
+ * @binder_transfer_file
+ *	Check whether @from is allowed to transfer @file to @to.
+ *	@from contains the task_struct for the sending task.
+ *	@file contains the struct file being transferred.
+ *	@to contains the task_struct for the receiving task.
+ *
  * @ptrace_access_check:
  *	Check permission before allowing the current process to trace the
  *	@child process.
@@ -1441,6 +1460,14 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
 struct security_operations {
 	char name[SECURITY_NAME_MAX + 1];
 
+	int (*binder_set_context_mgr) (struct task_struct *mgr);
+	int (*binder_transaction) (struct task_struct *from,
+				   struct task_struct *to);
+	int (*binder_transfer_binder) (struct task_struct *from,
+				       struct task_struct *to);
+	int (*binder_transfer_file) (struct task_struct *from,
+				     struct task_struct *to, struct file *file);
+
 	int (*ptrace_access_check) (struct task_struct *child, unsigned int mode);
 	int (*ptrace_traceme) (struct task_struct *parent);
 	int (*capget) (struct task_struct *target,
@@ -1739,6 +1766,13 @@ extern void __init security_fixup_ops(struct security_operations *ops);
 
 
 /* Security operations */
+int security_binder_set_context_mgr(struct task_struct *mgr);
+int security_binder_transaction(struct task_struct *from,
+				struct task_struct *to);
+int security_binder_transfer_binder(struct task_struct *from,
+				    struct task_struct *to);
+int security_binder_transfer_file(struct task_struct *from,
+				  struct task_struct *to, struct file *file);
 int security_ptrace_access_check(struct task_struct *child, unsigned int mode);
 int security_ptrace_traceme(struct task_struct *parent);
 int security_capget(struct task_struct *target,
@@ -1927,6 +1961,30 @@ static inline int security_init(void)
 	return 0;
 }
 
+static inline int security_binder_set_context_mgr(struct task_struct *mgr)
+{
+	return 0;
+}
+
+static inline int security_binder_transaction(struct task_struct *from,
+					      struct task_struct *to)
+{
+	return 0;
+}
+
+static inline int security_binder_transfer_binder(struct task_struct *from,
+						  struct task_struct *to)
+{
+	return 0;
+}
+
+static inline int security_binder_transfer_file(struct task_struct *from,
+						struct task_struct *to,
+						struct file *file)
+{
+	return 0;
+}
+
 static inline int security_ptrace_access_check(struct task_struct *child,
 					     unsigned int mode)
 {
diff --git a/security/capability.c b/security/capability.c
index d68c57a62bcf..070dd46f62f4 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -12,6 +12,29 @@
 
 #include <linux/security.h>
 
+static int cap_binder_set_context_mgr(struct task_struct *mgr)
+{
+	return 0;
+}
+
+static int cap_binder_transaction(struct task_struct *from,
+				  struct task_struct *to)
+{
+	return 0;
+}
+
+static int cap_binder_transfer_binder(struct task_struct *from,
+				      struct task_struct *to)
+{
+	return 0;
+}
+
+static int cap_binder_transfer_file(struct task_struct *from,
+				    struct task_struct *to, struct file *file)
+{
+	return 0;
+}
+
 static int cap_syslog(int type)
 {
 	return 0;
@@ -930,6 +953,10 @@ static void cap_audit_rule_free(void *lsmrule)
 
 void __init security_fixup_ops(struct security_operations *ops)
 {
+	set_to_cap_if_null(ops, binder_set_context_mgr);
+	set_to_cap_if_null(ops, binder_transaction);
+	set_to_cap_if_null(ops, binder_transfer_binder);
+	set_to_cap_if_null(ops, binder_transfer_file);
 	set_to_cap_if_null(ops, ptrace_access_check);
 	set_to_cap_if_null(ops, ptrace_traceme);
 	set_to_cap_if_null(ops, capget);
diff --git a/security/security.c b/security/security.c
index 18b35c63fc0c..b196de34b19f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -135,6 +135,29 @@ int __init register_security(struct security_operations *ops)
 
 /* Security operations */
 
+int security_binder_set_context_mgr(struct task_struct *mgr)
+{
+	return security_ops->binder_set_context_mgr(mgr);
+}
+
+int security_binder_transaction(struct task_struct *from,
+				struct task_struct *to)
+{
+	return security_ops->binder_transaction(from, to);
+}
+
+int security_binder_transfer_binder(struct task_struct *from,
+				    struct task_struct *to)
+{
+	return security_ops->binder_transfer_binder(from, to);
+}
+
+int security_binder_transfer_file(struct task_struct *from,
+				  struct task_struct *to, struct file *file)
+{
+	return security_ops->binder_transfer_file(from, to, file);
+}
+
 int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
 {
 #ifdef CONFIG_SECURITY_YAMA_STACKED
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 6da7532893a1..9d984bfb978b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1933,6 +1933,74 @@ static inline u32 open_file_to_av(struct file *file)
 
 /* Hook functions begin here. */
 
+static int selinux_binder_set_context_mgr(struct task_struct *mgr)
+{
+	u32 mysid = current_sid();
+	u32 mgrsid = task_sid(mgr);
+
+	return avc_has_perm(mysid, mgrsid, SECCLASS_BINDER,
+			    BINDER__SET_CONTEXT_MGR, NULL);
+}
+
+static int selinux_binder_transaction(struct task_struct *from,
+				      struct task_struct *to)
+{
+	u32 mysid = current_sid();
+	u32 fromsid = task_sid(from);
+	u32 tosid = task_sid(to);
+	int rc;
+
+	if (mysid != fromsid) {
+		rc = avc_has_perm(mysid, fromsid, SECCLASS_BINDER,
+				  BINDER__IMPERSONATE, NULL);
+		if (rc)
+			return rc;
+	}
+
+	return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__CALL,
+			    NULL);
+}
+
+static int selinux_binder_transfer_binder(struct task_struct *from,
+					  struct task_struct *to)
+{
+	u32 fromsid = task_sid(from);
+	u32 tosid = task_sid(to);
+
+	return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER,
+			    NULL);
+}
+
+static int selinux_binder_transfer_file(struct task_struct *from,
+					struct task_struct *to,
+					struct file *file)
+{
+	u32 sid = task_sid(to);
+	struct file_security_struct *fsec = file->f_security;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode_security_struct *isec = inode->i_security;
+	struct common_audit_data ad;
+	int rc;
+
+	ad.type = LSM_AUDIT_DATA_PATH;
+	ad.u.path = file->f_path;
+
+	if (sid != fsec->sid) {
+		rc = avc_has_perm(sid, fsec->sid,
+				  SECCLASS_FD,
+				  FD__USE,
+				  &ad);
+		if (rc)
+			return rc;
+	}
+
+	if (unlikely(IS_PRIVATE(inode)))
+		return 0;
+
+	return avc_has_perm(sid, isec->sid, isec->sclass, file_to_av(file),
+			    &ad);
+}
+
 static int selinux_ptrace_access_check(struct task_struct *child,
 				     unsigned int mode)
 {
@@ -5810,6 +5878,11 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
 static struct security_operations selinux_ops = {
 	.name =				"selinux",
 
+	.binder_set_context_mgr =	selinux_binder_set_context_mgr,
+	.binder_transaction =		selinux_binder_transaction,
+	.binder_transfer_binder =	selinux_binder_transfer_binder,
+	.binder_transfer_file =		selinux_binder_transfer_file,
+
 	.ptrace_access_check =		selinux_ptrace_access_check,
 	.ptrace_traceme =		selinux_ptrace_traceme,
 	.capget =			selinux_capget,
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index be491a74c1ed..eccd61b3de8a 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -151,5 +151,7 @@ struct security_class_mapping secclass_map[] = {
 	{ "kernel_service", { "use_as_override", "create_files_as", NULL } },
 	{ "tun_socket",
 	  { COMMON_SOCK_PERMS, "attach_queue", NULL } },
+	{ "binder", { "impersonate", "call", "set_context_mgr", "transfer",
+		      NULL } },
 	{ NULL }
   };
-- 
cgit v1.2.3


From d61031ee8df6214d58371a1cc36a0591e242fba0 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Fri, 9 Jan 2015 23:54:34 -0800
Subject: Drivers: hv: vmbus: Support a vmbus API for efficiently sending page
 arrays

Currently, the API for sending a multi-page buffer over VMBUS is limited to
a maximum pfn array of MAX_MULTIPAGE_BUFFER_COUNT. This limitation is
not imposed by the host and unnecessarily limits the maximum payload
that can be sent. Implement an API that does not have this restriction.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/hyperv.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index c76ffbe59f65..18c4f23dacf1 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -683,6 +683,50 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
 
+/*
+ * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
+ * using a GPADL Direct packet type.
+ * The buffer includes the vmbus descriptor.
+ */
+int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
+			      struct vmbus_packet_mpb_array *desc,
+			      u32 desc_size,
+			      void *buffer, u32 bufferlen, u64 requestid)
+{
+	int ret;
+	u32 packetlen;
+	u32 packetlen_aligned;
+	struct kvec bufferlist[3];
+	u64 aligned_data = 0;
+	bool signal = false;
+
+	packetlen = desc_size + bufferlen;
+	packetlen_aligned = ALIGN(packetlen, sizeof(u64));
+
+	/* Setup the descriptor */
+	desc->type = VM_PKT_DATA_USING_GPA_DIRECT;
+	desc->flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
+	desc->dataoffset8 = desc_size >> 3; /* in 8-bytes grandularity */
+	desc->length8 = (u16)(packetlen_aligned >> 3);
+	desc->transactionid = requestid;
+	desc->rangecount = 1;
+
+	bufferlist[0].iov_base = desc;
+	bufferlist[0].iov_len = desc_size;
+	bufferlist[1].iov_base = buffer;
+	bufferlist[1].iov_len = bufferlen;
+	bufferlist[2].iov_base = &aligned_data;
+	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
+
+	ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal);
+
+	if (ret == 0 && signal)
+		vmbus_setevent(channel);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc);
+
 /*
  * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
  * using a GPADL Direct packet type.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 476c685ca6f9..259023a34bec 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -57,6 +57,18 @@ struct hv_multipage_buffer {
 	u64 pfn_array[MAX_MULTIPAGE_BUFFER_COUNT];
 };
 
+/*
+ * Multiple-page buffer array; the pfn array is variable size:
+ * The number of entries in the PFN array is determined by
+ * "len" and "offset".
+ */
+struct hv_mpb_array {
+	/* Length and Offset determines the # of pfns in the array */
+	u32 len;
+	u32 offset;
+	u64 pfn_array[];
+};
+
 /* 0x18 includes the proprietary packet header */
 #define MAX_PAGE_BUFFER_PACKET		(0x18 +			\
 					(sizeof(struct hv_page_buffer) * \
@@ -814,6 +826,18 @@ struct vmbus_channel_packet_multipage_buffer {
 	struct hv_multipage_buffer range;
 } __packed;
 
+/* The format must be the same as struct vmdata_gpa_direct */
+struct vmbus_packet_mpb_array {
+	u16 type;
+	u16 dataoffset8;
+	u16 length8;
+	u16 flags;
+	u64 transactionid;
+	u32 reserved;
+	u32 rangecount;         /* Always 1 in this case */
+	struct hv_mpb_array range;
+} __packed;
+
 
 extern int vmbus_open(struct vmbus_channel *channel,
 			    u32 send_ringbuffersize,
@@ -845,6 +869,13 @@ extern int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
 					u32 bufferlen,
 					u64 requestid);
 
+extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
+				     struct vmbus_packet_mpb_array *mpb,
+				     u32 desc_size,
+				     void *buffer,
+				     u32 bufferlen,
+				     u64 requestid);
+
 extern int vmbus_establish_gpadl(struct vmbus_channel *channel,
 				      void *kbuffer,
 				      u32 size,
-- 
cgit v1.2.3


From 67fae053bfc6e84144150e4c6c62670abb215c33 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 20 Jan 2015 16:45:05 +0100
Subject: Drivers: hv: rename sc_lock to the more generic lock

sc_lock spinlock in struct vmbus_channel is being used to not only protect the
sc_list field, e.g. vmbus_open() function uses it to implement test-and-set
access to the state field. Rename it to the more generic 'lock' and add the
description.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      |  6 +++---
 drivers/hv/channel_mgmt.c | 10 +++++-----
 include/linux/hyperv.h    |  7 ++++++-
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 18c4f23dacf1..2978f5ee8d2a 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -73,14 +73,14 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
 	unsigned long flags;
 	int ret, t, err = 0;
 
-	spin_lock_irqsave(&newchannel->sc_lock, flags);
+	spin_lock_irqsave(&newchannel->lock, flags);
 	if (newchannel->state == CHANNEL_OPEN_STATE) {
 		newchannel->state = CHANNEL_OPENING_STATE;
 	} else {
-		spin_unlock_irqrestore(&newchannel->sc_lock, flags);
+		spin_unlock_irqrestore(&newchannel->lock, flags);
 		return -EINVAL;
 	}
-	spin_unlock_irqrestore(&newchannel->sc_lock, flags);
+	spin_unlock_irqrestore(&newchannel->lock, flags);
 
 	newchannel->onchannel_callback = onchannelcallback;
 	newchannel->channel_callback_context = context;
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 704c0e00f8d2..1e0b996ed643 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -146,7 +146,7 @@ static struct vmbus_channel *alloc_channel(void)
 		return NULL;
 
 	spin_lock_init(&channel->inbound_lock);
-	spin_lock_init(&channel->sc_lock);
+	spin_lock_init(&channel->lock);
 
 	INIT_LIST_HEAD(&channel->sc_list);
 	INIT_LIST_HEAD(&channel->percpu_list);
@@ -246,9 +246,9 @@ static void vmbus_process_rescind_offer(struct work_struct *work)
 		spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
 	} else {
 		primary_channel = channel->primary_channel;
-		spin_lock_irqsave(&primary_channel->sc_lock, flags);
+		spin_lock_irqsave(&primary_channel->lock, flags);
 		list_del(&channel->sc_list);
-		spin_unlock_irqrestore(&primary_channel->sc_lock, flags);
+		spin_unlock_irqrestore(&primary_channel->lock, flags);
 	}
 	free_channel(channel);
 }
@@ -323,9 +323,9 @@ static void vmbus_process_offer(struct work_struct *work)
 			 * Process the sub-channel.
 			 */
 			newchannel->primary_channel = channel;
-			spin_lock_irqsave(&channel->sc_lock, flags);
+			spin_lock_irqsave(&channel->lock, flags);
 			list_add_tail(&newchannel->sc_list, &channel->sc_list);
-			spin_unlock_irqrestore(&channel->sc_lock, flags);
+			spin_unlock_irqrestore(&channel->lock, flags);
 
 			if (newchannel->target_cpu != get_cpu()) {
 				put_cpu();
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 259023a34bec..5a2ba674795e 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -734,7 +734,12 @@ struct vmbus_channel {
 	 */
 	void (*sc_creation_callback)(struct vmbus_channel *new_sc);
 
-	spinlock_t sc_lock;
+	/*
+	 * The spinlock to protect the structure. It is being used to protect
+	 * test-and-set access to various attributes of the structure as well
+	 * as all sc_list operations.
+	 */
+	spinlock_t lock;
 	/*
 	 * All Sub-channels of a primary channel are linked here.
 	 */
-- 
cgit v1.2.3


From 77b3da6e3232d3b4d4b8addb4b05799fe98f3bf8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jan 2015 15:10:32 -0500
Subject: new primitive: debugfs_create_automount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/debugfs/inode.c      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h |  5 +++++
 2 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 1219dff8e18f..957c40ce09f6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -175,6 +175,18 @@ static const struct super_operations debugfs_super_operations = {
 	.show_options	= debugfs_show_options,
 };
 
+static struct vfsmount *debugfs_automount(struct path *path)
+{
+	struct vfsmount *(*f)(void *);
+	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
+	return f(path->dentry->d_inode->i_private);
+}
+
+static const struct dentry_operations debugfs_dops = {
+	.d_delete = always_delete_dentry,
+	.d_automount = debugfs_automount,
+};
+
 static int debug_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr debug_files[] = {{""}};
@@ -199,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 
 	sb->s_op = &debugfs_super_operations;
+	sb->s_d_op = &debugfs_dops;
 
 	debugfs_apply_options(sb);
 
@@ -367,6 +380,41 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 }
 EXPORT_SYMBOL_GPL(debugfs_create_dir);
 
+/**
+ * debugfs_create_automount - create automount point in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @f: function to be called when pathname resolution steps on that one.
+ * @data: opaque argument to pass to f().
+ *
+ * @f should return what ->d_automount() would.
+ */
+struct dentry *debugfs_create_automount(const char *name,
+					struct dentry *parent,
+					struct vfsmount *(*f)(void *),
+					void *data)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_flags |= S_AUTOMOUNT;
+	inode->i_private = data;
+	dentry->d_fsdata = (void *)f;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL(debugfs_create_automount);
+
 /**
  * debugfs_create_symlink- create a symbolic link in the debugfs filesystem
  * @name: a pointer to a string containing the name of the symbolic link to
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index da4c4983adbe..ea149a24a1f2 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -56,6 +56,11 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 				      const char *dest);
 
+struct dentry *debugfs_create_automount(const char *name,
+					struct dentry *parent,
+					struct vfsmount *(*f)(void *),
+					void *data);
+
 void debugfs_remove(struct dentry *dentry);
 void debugfs_remove_recursive(struct dentry *dentry);
 
-- 
cgit v1.2.3


From 03af03ad7c6723e0f87568e4ffe66ceb9608bfe7 Mon Sep 17 00:00:00 2001
From: Karol Wrona <k.wrona@samsung.com>
Date: Wed, 7 Jan 2015 19:36:11 +0100
Subject: iio: Add new operating mode for non triggered sw buffers

There was a need for non triggered software buffer type.  It can be used when
triggered model does not fit and INDIO_BUFFER_HARDWARE causes confusion because
the data stream can be obtained not directly form hardware backend.

Suggested-by: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Karol Wrona <k.wrona@samsung.com>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-buffer.c | 2 ++
 include/linux/iio/iio.h           | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 403b72878b0b..71333140d42c 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -639,6 +639,8 @@ static int __iio_update_buffers(struct iio_dev *indio_dev,
 		indio_dev->currentmode = INDIO_BUFFER_TRIGGERED;
 	} else if (indio_dev->modes & INDIO_BUFFER_HARDWARE) {
 		indio_dev->currentmode = INDIO_BUFFER_HARDWARE;
+	} else if (indio_dev->modes & INDIO_BUFFER_SOFTWARE) {
+		indio_dev->currentmode = INDIO_BUFFER_SOFTWARE;
 	} else { /* Should never be reached */
 		ret = -EINVAL;
 		goto error_run_postdisable;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 878d861b0610..590202024857 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -286,10 +286,11 @@ static inline s64 iio_get_time_ns(void)
 /* Device operating modes */
 #define INDIO_DIRECT_MODE		0x01
 #define INDIO_BUFFER_TRIGGERED		0x02
+#define INDIO_BUFFER_SOFTWARE		0x04
 #define INDIO_BUFFER_HARDWARE		0x08
 
 #define INDIO_ALL_BUFFER_MODES					\
-	(INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE)
+	(INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE | INDIO_BUFFER_SOFTWARE)
 
 #define INDIO_MAX_RAW_ELEMENTS		4
 
@@ -593,7 +594,8 @@ void devm_iio_trigger_free(struct device *dev, struct iio_trigger *iio_trig);
 static inline bool iio_buffer_enabled(struct iio_dev *indio_dev)
 {
 	return indio_dev->currentmode
-		& (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE);
+		& (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE |
+		   INDIO_BUFFER_SOFTWARE);
 }
 
 /**
-- 
cgit v1.2.3


From d6cb125b9983e1ea9444f794b2d3ed5e3ad737b7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 24 Dec 2014 22:47:00 -0500
Subject: kill d_validate()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 31 -------------------------------
 include/linux/dcache.h |  3 ---
 2 files changed, 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..40432e59d72e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2187,37 +2187,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 }
 EXPORT_SYMBOL(d_hash_and_lookup);
 
-/**
- * d_validate - verify dentry provided from insecure source (deprecated)
- * @dentry: The dentry alleged to be valid child of @dparent
- * @dparent: The parent dentry (known to be valid)
- *
- * An insecure source has sent us a dentry, here we verify it and dget() it.
- * This is used by ncpfs in its readdir implementation.
- * Zero is returned in the dentry is invalid.
- *
- * This function is slow for big directories, and deprecated, do not use it.
- */
-int d_validate(struct dentry *dentry, struct dentry *dparent)
-{
-	struct dentry *child;
-
-	spin_lock(&dparent->d_lock);
-	list_for_each_entry(child, &dparent->d_subdirs, d_child) {
-		if (dentry == child) {
-			spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-			__dget_dlock(dentry);
-			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dparent->d_lock);
-			return 1;
-		}
-	}
-	spin_unlock(&dparent->d_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(d_validate);
-
 /*
  * When a file is deleted, we have two options:
  * - turn this dentry into a negative dentry
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 5a813988e6d4..92c08cf7670e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -319,9 +319,6 @@ static inline unsigned d_count(const struct dentry *dentry)
 	return dentry->d_lockref.count;
 }
 
-/* validate "insecure" dentry pointer */
-extern int d_validate(struct dentry *, struct dentry *);
-
 /*
  * helper function for dentry_operations.d_dname() members
  */
-- 
cgit v1.2.3


From 9e251d02041432487d89cb340e72490c4bbc198a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 9 Jan 2015 20:40:02 -0500
Subject: kill pin_put()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_pin.c            | 11 -----------
 include/linux/fs_pin.h |  1 -
 kernel/acct.c          | 14 ++++++++++----
 3 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 9368236ca100..f173313760b8 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -4,19 +4,8 @@
 #include "internal.h"
 #include "mount.h"
 
-static void pin_free_rcu(struct rcu_head *head)
-{
-	kfree(container_of(head, struct fs_pin, rcu));
-}
-
 static DEFINE_SPINLOCK(pin_lock);
 
-void pin_put(struct fs_pin *p)
-{
-	if (atomic_long_dec_and_test(&p->count))
-		call_rcu(&p->rcu, pin_free_rcu);
-}
-
 void pin_remove(struct fs_pin *pin)
 {
 	spin_lock(&pin_lock);
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index f66525e72ccf..68a54b7741aa 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -12,6 +12,5 @@ struct fs_pin {
 	void (*kill)(struct fs_pin *);
 };
 
-void pin_put(struct fs_pin *);
 void pin_remove(struct fs_pin *);
 void pin_insert(struct fs_pin *, struct vfsmount *);
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..7bb9e659a7da 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -124,6 +124,12 @@ out:
 	return acct->active;
 }
 
+static void acct_put(struct bsd_acct_struct *p)
+{
+	if (atomic_long_dec_and_test(&p->pin.count))
+		kfree_rcu(p, pin.rcu);
+}
+
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
@@ -144,7 +150,7 @@ again:
 	mutex_lock(&res->lock);
 	if (!res->ns) {
 		mutex_unlock(&res->lock);
-		pin_put(&res->pin);
+		acct_put(res);
 		goto again;
 	}
 	return res;
@@ -175,7 +181,7 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		acct->ns = NULL;
 		atomic_long_dec(&acct->pin.count);
 		mutex_unlock(&acct->lock);
-		pin_put(&acct->pin);
+		acct_put(acct);
 	}
 }
 
@@ -186,7 +192,7 @@ static void acct_pin_kill(struct fs_pin *pin)
 	mutex_lock(&acct->lock);
 	if (!acct->ns) {
 		mutex_unlock(&acct->lock);
-		pin_put(pin);
+		acct_put(acct);
 		acct = NULL;
 	}
 	acct_kill(acct, NULL);
@@ -576,7 +582,7 @@ static void slow_acct_process(struct pid_namespace *ns)
 		if (acct) {
 			do_acct_process(acct);
 			mutex_unlock(&acct->lock);
-			pin_put(&acct->pin);
+			acct_put(acct);
 		}
 	}
 }
-- 
cgit v1.2.3


From 360f54796ed65939093ae373b92ebd5ef3341776 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 9 Jan 2015 15:19:03 -0800
Subject: dcache: let the dentry count go down to zero without taking d_lock

We can be more aggressive about this, if we are clever and careful. This is subtle.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c             | 118 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/lockref.h |   3 +-
 lib/lockref.c           |  36 +++++++++++----
 3 files changed, 144 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 40432e59d72e..a14d00e9839e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -508,7 +508,7 @@ static void __dentry_kill(struct dentry *dentry)
 	 * dentry_iput drops the locks, at which point nobody (except
 	 * transient RCU lookups) can reach this dentry.
 	 */
-	BUG_ON((int)dentry->d_lockref.count > 0);
+	BUG_ON(dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
@@ -561,7 +561,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 	struct dentry *parent = dentry->d_parent;
 	if (IS_ROOT(dentry))
 		return NULL;
-	if (unlikely((int)dentry->d_lockref.count < 0))
+	if (unlikely(dentry->d_lockref.count < 0))
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
@@ -590,6 +590,110 @@ again:
 	return parent;
 }
 
+/*
+ * Try to do a lockless dput(), and return whether that was successful.
+ *
+ * If unsuccessful, we return false, having already taken the dentry lock.
+ *
+ * The caller needs to hold the RCU read lock, so that the dentry is
+ * guaranteed to stay around even if the refcount goes down to zero!
+ */
+static inline bool fast_dput(struct dentry *dentry)
+{
+	int ret;
+	unsigned int d_flags;
+
+	/*
+	 * If we have a d_op->d_delete() operation, we sould not
+	 * let the dentry count go to zero, so use "put__or_lock".
+	 */
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
+		return lockref_put_or_lock(&dentry->d_lockref);
+
+	/*
+	 * .. otherwise, we can try to just decrement the
+	 * lockref optimistically.
+	 */
+	ret = lockref_put_return(&dentry->d_lockref);
+
+	/*
+	 * If the lockref_put_return() failed due to the lock being held
+	 * by somebody else, the fast path has failed. We will need to
+	 * get the lock, and then check the count again.
+	 */
+	if (unlikely(ret < 0)) {
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_lockref.count > 1) {
+			dentry->d_lockref.count--;
+			spin_unlock(&dentry->d_lock);
+			return 1;
+		}
+		return 0;
+	}
+
+	/*
+	 * If we weren't the last ref, we're done.
+	 */
+	if (ret)
+		return 1;
+
+	/*
+	 * Careful, careful. The reference count went down
+	 * to zero, but we don't hold the dentry lock, so
+	 * somebody else could get it again, and do another
+	 * dput(), and we need to not race with that.
+	 *
+	 * However, there is a very special and common case
+	 * where we don't care, because there is nothing to
+	 * do: the dentry is still hashed, it does not have
+	 * a 'delete' op, and it's referenced and already on
+	 * the LRU list.
+	 *
+	 * NOTE! Since we aren't locked, these values are
+	 * not "stable". However, it is sufficient that at
+	 * some point after we dropped the reference the
+	 * dentry was hashed and the flags had the proper
+	 * value. Other dentry users may have re-gotten
+	 * a reference to the dentry and change that, but
+	 * our work is done - we can leave the dentry
+	 * around with a zero refcount.
+	 */
+	smp_rmb();
+	d_flags = ACCESS_ONCE(dentry->d_flags);
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
+
+	/* Nothing to do? Dropping the reference was all we needed? */
+	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+		return 1;
+
+	/*
+	 * Not the fast normal case? Get the lock. We've already decremented
+	 * the refcount, but we'll need to re-check the situation after
+	 * getting the lock.
+	 */
+	spin_lock(&dentry->d_lock);
+
+	/*
+	 * Did somebody else grab a reference to it in the meantime, and
+	 * we're no longer the last user after all? Alternatively, somebody
+	 * else could have killed it and marked it dead. Either way, we
+	 * don't need to do anything else.
+	 */
+	if (dentry->d_lockref.count) {
+		spin_unlock(&dentry->d_lock);
+		return 1;
+	}
+
+	/*
+	 * Re-get the reference we optimistically dropped. We hold the
+	 * lock, and we just tested that it was zero, so we can just
+	 * set it to 1.
+	 */
+	dentry->d_lockref.count = 1;
+	return 0;
+}
+
+
 /* 
  * This is dput
  *
@@ -622,8 +726,14 @@ void dput(struct dentry *dentry)
 		return;
 
 repeat:
-	if (lockref_put_or_lock(&dentry->d_lockref))
+	rcu_read_lock();
+	if (likely(fast_dput(dentry))) {
+		rcu_read_unlock();
 		return;
+	}
+
+	/* Slow case: now with the dentry lock held */
+	rcu_read_unlock();
 
 	/* Unreachable? Get rid of it */
 	if (unlikely(d_unhashed(dentry)))
@@ -810,7 +920,7 @@ static void shrink_dentry_list(struct list_head *list)
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup. Do not free it.
 		 */
-		if ((int)dentry->d_lockref.count > 0) {
+		if (dentry->d_lockref.count > 0) {
 			spin_unlock(&dentry->d_lock);
 			if (parent)
 				spin_unlock(&parent->d_lock);
diff --git a/include/linux/lockref.h b/include/linux/lockref.h
index 4bfde0e99ed5..b10b122dd099 100644
--- a/include/linux/lockref.h
+++ b/include/linux/lockref.h
@@ -28,12 +28,13 @@ struct lockref {
 #endif
 		struct {
 			spinlock_t lock;
-			unsigned int count;
+			int count;
 		};
 	};
 };
 
 extern void lockref_get(struct lockref *);
+extern int lockref_put_return(struct lockref *);
 extern int lockref_get_not_zero(struct lockref *);
 extern int lockref_get_or_lock(struct lockref *);
 extern int lockref_put_or_lock(struct lockref *);
diff --git a/lib/lockref.c b/lib/lockref.c
index d2233de9a86e..ecb9a665ec19 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -60,7 +60,7 @@ void lockref_get(struct lockref *lockref)
 EXPORT_SYMBOL(lockref_get);
 
 /**
- * lockref_get_not_zero - Increments count unless the count is 0
+ * lockref_get_not_zero - Increments count unless the count is 0 or dead
  * @lockref: pointer to lockref structure
  * Return: 1 if count updated successfully or 0 if count was zero
  */
@@ -70,7 +70,7 @@ int lockref_get_not_zero(struct lockref *lockref)
 
 	CMPXCHG_LOOP(
 		new.count++;
-		if (!old.count)
+		if (old.count <= 0)
 			return 0;
 	,
 		return 1;
@@ -78,7 +78,7 @@ int lockref_get_not_zero(struct lockref *lockref)
 
 	spin_lock(&lockref->lock);
 	retval = 0;
-	if (lockref->count) {
+	if (lockref->count > 0) {
 		lockref->count++;
 		retval = 1;
 	}
@@ -88,7 +88,7 @@ int lockref_get_not_zero(struct lockref *lockref)
 EXPORT_SYMBOL(lockref_get_not_zero);
 
 /**
- * lockref_get_or_lock - Increments count unless the count is 0
+ * lockref_get_or_lock - Increments count unless the count is 0 or dead
  * @lockref: pointer to lockref structure
  * Return: 1 if count updated successfully or 0 if count was zero
  * and we got the lock instead.
@@ -97,14 +97,14 @@ int lockref_get_or_lock(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
 		new.count++;
-		if (!old.count)
+		if (old.count <= 0)
 			break;
 	,
 		return 1;
 	);
 
 	spin_lock(&lockref->lock);
-	if (!lockref->count)
+	if (lockref->count <= 0)
 		return 0;
 	lockref->count++;
 	spin_unlock(&lockref->lock);
@@ -112,6 +112,26 @@ int lockref_get_or_lock(struct lockref *lockref)
 }
 EXPORT_SYMBOL(lockref_get_or_lock);
 
+/**
+ * lockref_put_return - Decrement reference count if possible
+ * @lockref: pointer to lockref structure
+ *
+ * Decrement the reference count and return the new value.
+ * If the lockref was dead or locked, return an error.
+ */
+int lockref_put_return(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count--;
+		if (old.count <= 0)
+			return -1;
+	,
+		return new.count;
+	);
+	return -1;
+}
+EXPORT_SYMBOL(lockref_put_return);
+
 /**
  * lockref_put_or_lock - decrements count unless count <= 1 before decrement
  * @lockref: pointer to lockref structure
@@ -158,7 +178,7 @@ int lockref_get_not_dead(struct lockref *lockref)
 
 	CMPXCHG_LOOP(
 		new.count++;
-		if ((int)old.count < 0)
+		if (old.count < 0)
 			return 0;
 	,
 		return 1;
@@ -166,7 +186,7 @@ int lockref_get_not_dead(struct lockref *lockref)
 
 	spin_lock(&lockref->lock);
 	retval = 0;
-	if ((int) lockref->count >= 0) {
+	if (lockref->count >= 0) {
 		lockref->count++;
 		retval = 1;
 	}
-- 
cgit v1.2.3


From 34cece2e8a1d2b66f00e153a19b80b4d4cec4eb8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 10 Jan 2015 12:47:38 -0500
Subject: take count and rcu_head out of fs_pin

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs_pin.h | 10 ++--------
 kernel/acct.c          | 14 ++++++++------
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 68a54b7741aa..a2e71912e758 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -1,14 +1,8 @@
 #include <linux/fs.h>
 
 struct fs_pin {
-	atomic_long_t		count;
-	union {
-		struct {
-			struct hlist_node	s_list;
-			struct hlist_node	m_list;
-		};
-		struct rcu_head rcu;
-	};
+	struct hlist_node	s_list;
+	struct hlist_node	m_list;
 	void (*kill)(struct fs_pin *);
 };
 
diff --git a/kernel/acct.c b/kernel/acct.c
index a74f3d1a0c26..b8fbefb8678f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -80,6 +80,8 @@ static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
 	struct fs_pin		pin;
+	atomic_long_t		count;
+	struct rcu_head		rcu;
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
@@ -126,8 +128,8 @@ out:
 
 static void acct_put(struct bsd_acct_struct *p)
 {
-	if (atomic_long_dec_and_test(&p->pin.count))
-		kfree_rcu(p, pin.rcu);
+	if (atomic_long_dec_and_test(&p->count))
+		kfree_rcu(p, rcu);
 }
 
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
@@ -141,7 +143,7 @@ again:
 		rcu_read_unlock();
 		return NULL;
 	}
-	if (!atomic_long_inc_not_zero(&res->pin.count)) {
+	if (!atomic_long_inc_not_zero(&res->count)) {
 		rcu_read_unlock();
 		cpu_relax();
 		goto again;
@@ -179,7 +181,7 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		pin_remove(&acct->pin);
 		ns->bacct = new;
 		acct->ns = NULL;
-		atomic_long_dec(&acct->pin.count);
+		atomic_long_dec(&acct->count);
 		mutex_unlock(&acct->lock);
 		acct_put(acct);
 	}
@@ -189,7 +191,7 @@ static void acct_pin_kill(struct fs_pin *pin)
 {
 	struct bsd_acct_struct *acct;
 	acct = container_of(pin, struct bsd_acct_struct, pin);
-	if (!atomic_long_inc_not_zero(&pin->count)) {
+	if (!atomic_long_inc_not_zero(&acct->count)) {
 		rcu_read_unlock();
 		cpu_relax();
 		return;
@@ -250,7 +252,7 @@ static int acct_on(struct filename *pathname)
 	mnt = file->f_path.mnt;
 	file->f_path.mnt = internal;
 
-	atomic_long_set(&acct->pin.count, 1);
+	atomic_long_set(&acct->count, 1);
 	acct->pin.kill = acct_pin_kill;
 	acct->file = file;
 	acct->needcheck = jiffies;
-- 
cgit v1.2.3


From fdab684d7202774bfd8762d4a656a553b787c8ec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 11 Jan 2015 10:57:27 -0500
Subject: allow attaching fs_pin to a group not associated with some superblock

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_pin.c            | 20 +++++++++++++-------
 fs/internal.h          |  2 +-
 fs/super.c             |  4 ++--
 include/linux/fs_pin.h |  1 +
 4 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 5eb39a93a560..50ef7d2ef03c 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -14,14 +14,20 @@ void pin_remove(struct fs_pin *pin)
 	spin_unlock(&pin_lock);
 }
 
-void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
 {
 	spin_lock(&pin_lock);
-	hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+	if (p)
+		hlist_add_head(&pin->s_list, p);
 	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
 	spin_unlock(&pin_lock);
 }
 
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+	pin_insert_group(pin, m, &m->mnt_sb->s_pins);
+}
+
 void mnt_pin_kill(struct mount *m)
 {
 	while (1) {
@@ -38,18 +44,18 @@ void mnt_pin_kill(struct mount *m)
 	}
 }
 
-void sb_pin_kill(struct super_block *sb)
+void group_pin_kill(struct hlist_head *p)
 {
 	while (1) {
-		struct hlist_node *p;
+		struct hlist_node *q;
 		struct fs_pin *pin;
 		rcu_read_lock();
-		p = ACCESS_ONCE(sb->s_pins.first);
-		if (!p) {
+		q = ACCESS_ONCE(p->first);
+		if (!q) {
 			rcu_read_unlock();
 			break;
 		}
-		pin = hlist_entry(p, struct fs_pin, s_list);
+		pin = hlist_entry(q, struct fs_pin, s_list);
 		pin->kill(pin);
 	}
 }
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..a6efd2f09ba3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -145,7 +145,7 @@ extern const struct file_operations pipefifo_fops;
 /*
  * fs_pin.c
  */
-extern void sb_pin_kill(struct super_block *sb);
+extern void group_pin_kill(struct hlist_head *p);
 extern void mnt_pin_kill(struct mount *m);
 
 /*
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..2d822459bc3d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -706,9 +706,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
 	if (remount_ro) {
-		if (sb->s_pins.first) {
+		if (!hlist_empty(&sb->s_pins)) {
 			up_write(&sb->s_umount);
-			sb_pin_kill(sb);
+			group_pin_kill(&sb->s_pins);
 			down_write(&sb->s_umount);
 			if (!sb->s_root)
 				return 0;
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index a2e71912e758..2be38d1464ae 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -7,4 +7,5 @@ struct fs_pin {
 };
 
 void pin_remove(struct fs_pin *);
+void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *);
 void pin_insert(struct fs_pin *, struct vfsmount *);
-- 
cgit v1.2.3


From 59eda0e07f43c950d31756213b607af673e551f0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 10 Jan 2015 17:53:21 -0500
Subject: new fs_pin killing logics

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_pin.c                   | 54 +++++++++++++++++++++++++----
 include/linux/fs_pin.h        | 13 ++++++-
 include/linux/pid_namespace.h |  4 +--
 kernel/acct.c                 | 81 ++++++++++++++++++-------------------------
 4 files changed, 96 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 50ef7d2ef03c..0c77bdc238b2 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,4 +1,5 @@
 #include <linux/fs.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/fs_pin.h>
 #include "internal.h"
@@ -12,6 +13,10 @@ void pin_remove(struct fs_pin *pin)
 	hlist_del(&pin->m_list);
 	hlist_del(&pin->s_list);
 	spin_unlock(&pin_lock);
+	spin_lock_irq(&pin->wait.lock);
+	pin->done = 1;
+	wake_up_locked(&pin->wait);
+	spin_unlock_irq(&pin->wait.lock);
 }
 
 void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
@@ -28,19 +33,58 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 	pin_insert_group(pin, m, &m->mnt_sb->s_pins);
 }
 
+void pin_kill(struct fs_pin *p)
+{
+	wait_queue_t wait;
+
+	if (!p) {
+		rcu_read_unlock();
+		return;
+	}
+	init_wait(&wait);
+	spin_lock_irq(&p->wait.lock);
+	if (likely(!p->done)) {
+		p->done = -1;
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		p->kill(p);
+		return;
+	}
+	if (p->done > 0) {
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		return;
+	}
+	__add_wait_queue(&p->wait, &wait);
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		schedule();
+		rcu_read_lock();
+		if (likely(list_empty(&wait.task_list)))
+			break;
+		/* OK, we know p couldn't have been freed yet */
+		spin_lock_irq(&p->wait.lock);
+		if (p->done > 0) {
+			spin_unlock_irq(&p->wait.lock);
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+
 void mnt_pin_kill(struct mount *m)
 {
 	while (1) {
 		struct hlist_node *p;
-		struct fs_pin *pin;
 		rcu_read_lock();
 		p = ACCESS_ONCE(m->mnt_pins.first);
 		if (!p) {
 			rcu_read_unlock();
 			break;
 		}
-		pin = hlist_entry(p, struct fs_pin, m_list);
-		pin->kill(pin);
+		pin_kill(hlist_entry(p, struct fs_pin, m_list));
 	}
 }
 
@@ -48,14 +92,12 @@ void group_pin_kill(struct hlist_head *p)
 {
 	while (1) {
 		struct hlist_node *q;
-		struct fs_pin *pin;
 		rcu_read_lock();
 		q = ACCESS_ONCE(p->first);
 		if (!q) {
 			rcu_read_unlock();
 			break;
 		}
-		pin = hlist_entry(q, struct fs_pin, s_list);
-		pin->kill(pin);
+		pin_kill(hlist_entry(q, struct fs_pin, s_list));
 	}
 }
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 2be38d1464ae..9dc4e0384bfb 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -1,11 +1,22 @@
-#include <linux/fs.h>
+#include <linux/wait.h>
 
 struct fs_pin {
+	wait_queue_head_t	wait;
+	int			done;
 	struct hlist_node	s_list;
 	struct hlist_node	m_list;
 	void (*kill)(struct fs_pin *);
 };
 
+struct vfsmount;
+
+static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
+{
+	init_waitqueue_head(&p->wait);
+	p->kill = kill;
+}
+
 void pin_remove(struct fs_pin *);
 void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *);
 void pin_insert(struct fs_pin *, struct vfsmount *);
+void pin_kill(struct fs_pin *);
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b9cf6c51b181..918b117a7cd3 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -19,7 +19,7 @@ struct pidmap {
 #define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
 #define PIDMAP_ENTRIES		((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
 
-struct bsd_acct_struct;
+struct fs_pin;
 
 struct pid_namespace {
 	struct kref kref;
@@ -37,7 +37,7 @@ struct pid_namespace {
 	struct dentry *proc_thread_self;
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
-	struct bsd_acct_struct *bacct;
+	struct fs_pin *bacct;
 #endif
 	struct user_namespace *user_ns;
 	struct work_struct proc_work;
diff --git a/kernel/acct.c b/kernel/acct.c
index cf6588ab517b..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,7 +76,6 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
 	struct fs_pin		pin;
@@ -91,6 +90,8 @@ struct bsd_acct_struct {
 	struct completion	done;
 };
 
+static void do_acct_process(struct bsd_acct_struct *acct);
+
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
@@ -132,13 +133,18 @@ static void acct_put(struct bsd_acct_struct *p)
 		kfree_rcu(p, rcu);
 }
 
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+	return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
 again:
 	smp_rmb();
 	rcu_read_lock();
-	res = ACCESS_ONCE(ns->bacct);
+	res = to_acct(ACCESS_ONCE(ns->bacct));
 	if (!res) {
 		rcu_read_unlock();
 		return NULL;
@@ -150,7 +156,7 @@ again:
 	}
 	rcu_read_unlock();
 	mutex_lock(&res->lock);
-	if (!res->ns) {
+	if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
 		mutex_unlock(&res->lock);
 		acct_put(res);
 		goto again;
@@ -158,6 +164,19 @@ again:
 	return res;
 }
 
+static void acct_pin_kill(struct fs_pin *pin)
+{
+	struct bsd_acct_struct *acct = to_acct(pin);
+	mutex_lock(&acct->lock);
+	do_acct_process(acct);
+	schedule_work(&acct->work);
+	wait_for_completion(&acct->done);
+	cmpxchg(&acct->ns->bacct, pin, NULL);
+	mutex_unlock(&acct->lock);
+	pin_remove(pin);
+	acct_put(acct);
+}
+
 static void close_work(struct work_struct *work)
 {
 	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -168,49 +187,13 @@ static void close_work(struct work_struct *work)
 	complete(&acct->done);
 }
 
-static void acct_kill(struct bsd_acct_struct *acct)
-{
-	if (acct) {
-		struct pid_namespace *ns = acct->ns;
-		do_acct_process(acct);
-		INIT_WORK(&acct->work, close_work);
-		init_completion(&acct->done);
-		schedule_work(&acct->work);
-		wait_for_completion(&acct->done);
-		pin_remove(&acct->pin);
-		cmpxchg(&ns->bacct, acct, NULL);
-		acct->ns = NULL;
-		atomic_long_dec(&acct->count);
-		mutex_unlock(&acct->lock);
-		acct_put(acct);
-	}
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
-	struct bsd_acct_struct *acct;
-	acct = container_of(pin, struct bsd_acct_struct, pin);
-	if (!atomic_long_inc_not_zero(&acct->count)) {
-		rcu_read_unlock();
-		cpu_relax();
-		return;
-	}
-	rcu_read_unlock();
-	mutex_lock(&acct->lock);
-	if (!acct->ns) {
-		mutex_unlock(&acct->lock);
-		acct_put(acct);
-		acct = NULL;
-	}
-	acct_kill(acct);
-}
-
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt, *internal;
 	struct pid_namespace *ns = task_active_pid_ns(current);
-	struct bsd_acct_struct *acct, *old;
+	struct bsd_acct_struct *acct;
+	struct fs_pin *old;
 	int err;
 
 	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -252,18 +235,20 @@ static int acct_on(struct filename *pathname)
 	file->f_path.mnt = internal;
 
 	atomic_long_set(&acct->count, 1);
-	acct->pin.kill = acct_pin_kill;
+	init_fs_pin(&acct->pin, acct_pin_kill);
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
 	mutex_init(&acct->lock);
+	INIT_WORK(&acct->work, close_work);
+	init_completion(&acct->done);
 	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
 	pin_insert(&acct->pin, mnt);
 
-	old = acct_get(ns);
-	ns->bacct = acct;
-	acct_kill(old);
+	rcu_read_lock();
+	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
+	pin_kill(old);
 	mnt_drop_write(mnt);
 	mntput(mnt);
 	return 0;
@@ -299,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		acct_kill(acct_get(task_active_pid_ns(current)));
+		rcu_read_lock();
+		pin_kill(task_active_pid_ns(current)->bacct);
 	}
 
 	return error;
@@ -307,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	acct_kill(acct_get(ns));
+	rcu_read_lock();
+	pin_kill(ns->bacct);
 }
 
 /*
-- 
cgit v1.2.3


From 19f92b237b1700d30b788f00b16a627ffbfdf0e5 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 13 Jan 2015 14:23:25 -0800
Subject: irqchip: omap-intc: Fix support for dm814 and dm816

On dm81xx we have 128 interrupts like am33xx has. Let's add
compatible flags for dm814x and dm816x, and document the
existing binding.

As the dm81xx are booting in device tree only mode, we can now
also remove ti81xx_init_irq() legacy function.

Signed-off-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Felipe Balbi <balbi@ti.com>
Cc: Brian Hutchinson <b.hutchman@gmail.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1421187806-6804-2-git-send-email-tony@atomide.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 .../interrupt-controller/ti,omap-intc-irq.txt      | 28 ++++++++++++++++++++++
 drivers/irqchip/irq-omap-intc.c                    | 14 ++++-------
 include/linux/irqchip/irq-omap-intc.h              |  1 -
 3 files changed, 33 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/interrupt-controller/ti,omap-intc-irq.txt

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,omap-intc-irq.txt b/Documentation/devicetree/bindings/interrupt-controller/ti,omap-intc-irq.txt
new file mode 100644
index 000000000000..38ce5d037722
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/ti,omap-intc-irq.txt
@@ -0,0 +1,28 @@
+Omap2/3 intc controller
+
+On TI omap2 and 3 the intc interrupt controller can provide
+96 or 128 IRQ signals to the ARM host depending on the SoC.
+
+Required Properties:
+- compatible: should be one of
+			"ti,omap2-intc"
+			"ti,omap3-intc"
+			"ti,dm814-intc"
+			"ti,dm816-intc"
+			"ti,am33xx-intc"
+
+- interrupt-controller : Identifies the node as an interrupt controller
+- #interrupt-cells : Specifies the number of cells needed to encode interrupt
+		     source, should be 1 for intc
+- interrupts: interrupt reference to primary interrupt controller
+
+Please refer to interrupts.txt in this directory for details of the common
+Interrupt Controllers bindings used by client devices.
+
+Example:
+	intc: interrupt-controller@48200000 {
+		compatible = "ti,omap3-intc";
+		interrupt-controller;
+		#interrupt-cells = <1>;
+		reg = <0x48200000 0x1000>;
+	};
diff --git a/drivers/irqchip/irq-omap-intc.c b/drivers/irqchip/irq-omap-intc.c
index c03f140acbae..b444d0e48f1f 100644
--- a/drivers/irqchip/irq-omap-intc.c
+++ b/drivers/irqchip/irq-omap-intc.c
@@ -380,14 +380,6 @@ void __init omap3_init_irq(void)
 	set_handle_irq(omap_intc_handle_irq);
 }
 
-void __init ti81xx_init_irq(void)
-{
-	omap_nr_irqs = 96;
-	omap_nr_pending = 4;
-	omap_init_irq(OMAP34XX_IC_BASE, NULL);
-	set_handle_irq(omap_intc_handle_irq);
-}
-
 static int __init intc_of_init(struct device_node *node,
 			     struct device_node *parent)
 {
@@ -399,7 +391,9 @@ static int __init intc_of_init(struct device_node *node,
 	if (WARN_ON(!node))
 		return -ENODEV;
 
-	if (of_device_is_compatible(node, "ti,am33xx-intc")) {
+	if (of_device_is_compatible(node, "ti,dm814-intc") ||
+	    of_device_is_compatible(node, "ti,dm816-intc") ||
+	    of_device_is_compatible(node, "ti,am33xx-intc")) {
 		omap_nr_irqs = 128;
 		omap_nr_pending = 4;
 	}
@@ -415,4 +409,6 @@ static int __init intc_of_init(struct device_node *node,
 
 IRQCHIP_DECLARE(omap2_intc, "ti,omap2-intc", intc_of_init);
 IRQCHIP_DECLARE(omap3_intc, "ti,omap3-intc", intc_of_init);
+IRQCHIP_DECLARE(dm814x_intc, "ti,dm814-intc", intc_of_init);
+IRQCHIP_DECLARE(dm816x_intc, "ti,dm816-intc", intc_of_init);
 IRQCHIP_DECLARE(am33xx_intc, "ti,am33xx-intc", intc_of_init);
diff --git a/include/linux/irqchip/irq-omap-intc.h b/include/linux/irqchip/irq-omap-intc.h
index e06b370cfc0d..bda426ab0ab7 100644
--- a/include/linux/irqchip/irq-omap-intc.h
+++ b/include/linux/irqchip/irq-omap-intc.h
@@ -20,7 +20,6 @@
 
 void omap2_init_irq(void);
 void omap3_init_irq(void);
-void ti81xx_init_irq(void);
 
 int omap_irq_pending(void);
 void omap_intc_save_context(void);
-- 
cgit v1.2.3


From c7f2a2ac377626897ad68e63d24d85ee21f47bb1 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 13 Jan 2015 14:23:26 -0800
Subject: irqchip: omap-intc: Remove unused legacy interface for omap2

Nowadays omap2 is booting in device tree only mode so there is no
need to keep the legacy interface around for omap2_init_irq().

Signed-off-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Felipe Balbi <balbi@ti.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1421187806-6804-3-git-send-email-tony@atomide.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irq-omap-intc.c       | 8 --------
 include/linux/irqchip/irq-omap-intc.h | 1 -
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-omap-intc.c b/drivers/irqchip/irq-omap-intc.c
index b444d0e48f1f..a569c6dbd1d1 100644
--- a/drivers/irqchip/irq-omap-intc.c
+++ b/drivers/irqchip/irq-omap-intc.c
@@ -364,14 +364,6 @@ out:
 		omap_ack_irq(NULL);
 }
 
-void __init omap2_init_irq(void)
-{
-	omap_nr_irqs = 96;
-	omap_nr_pending = 3;
-	omap_init_irq(OMAP24XX_IC_BASE, NULL);
-	set_handle_irq(omap_intc_handle_irq);
-}
-
 void __init omap3_init_irq(void)
 {
 	omap_nr_irqs = 96;
diff --git a/include/linux/irqchip/irq-omap-intc.h b/include/linux/irqchip/irq-omap-intc.h
index bda426ab0ab7..2e3d1afeb674 100644
--- a/include/linux/irqchip/irq-omap-intc.h
+++ b/include/linux/irqchip/irq-omap-intc.h
@@ -18,7 +18,6 @@
 #ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_OMAP_INTC_H
 #define __INCLUDE_LINUX_IRQCHIP_IRQ_OMAP_INTC_H
 
-void omap2_init_irq(void);
 void omap3_init_irq(void);
 
 int omap_irq_pending(void);
-- 
cgit v1.2.3


From 7aea8389a77abf9fde254aca2434a605c7704f58 Mon Sep 17 00:00:00 2001
From: Jacek Anaszewski <j.anaszewski@samsung.com>
Date: Fri, 9 Jan 2015 07:22:51 -0800
Subject: leds: Add LED Flash class extension to the LED subsystem

Some LED devices support two operation modes - torch and flash.
This patch provides support for flash LED devices in the LED subsystem
by introducing new sysfs attributes and kernel internal interface.
The attributes being introduced are: flash_brightness, flash_strobe,
flash_timeout, max_flash_timeout, max_flash_brightness, flash_fault,
flash_sync_strobe and available_sync_leds. All the flash related
features are placed in a separate module.

The modifications aim to be compatible with V4L2 framework requirements
related to the flash devices management. The design assumes that V4L2
sub-device can take of the LED class device control and communicate
with it through the kernel internal interface. When V4L2 Flash sub-device
file is opened, the LED class device sysfs interface is made
unavailable.

Signed-off-by: Jacek Anaszewski <j.anaszewski@samsung.com>
Acked-by: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Richard Purdie <rpurdie@rpsys.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Bryan Wu <cooloney@gmail.com>
---
 drivers/leds/Kconfig            |  10 +
 drivers/leds/Makefile           |   1 +
 drivers/leds/led-class-flash.c  | 486 ++++++++++++++++++++++++++++++++++++++++
 drivers/leds/led-class.c        |   4 +
 include/linux/led-class-flash.h | 207 +++++++++++++++++
 include/linux/leds.h            |   3 +
 6 files changed, 711 insertions(+)
 create mode 100644 drivers/leds/led-class-flash.c
 create mode 100644 include/linux/led-class-flash.h

(limited to 'include/linux')

diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index a6c3d2f153f3..25b320d64e26 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -22,6 +22,16 @@ config LEDS_CLASS
 	  This option enables the led sysfs class in /sys/class/leds.  You'll
 	  need this to do anything useful with LEDs.  If unsure, say N.
 
+config LEDS_CLASS_FLASH
+	tristate "LED Flash Class Support"
+	depends on LEDS_CLASS
+	help
+	  This option enables the flash led sysfs class in /sys/class/leds.
+	  It wrapps LED Class and adds flash LEDs specific sysfs attributes
+	  and kernel internal API to it. You'll need this to provide support
+	  for the flash related features of a LED device. It can be built
+	  as a module.
+
 comment "LED drivers"
 
 config LEDS_88PM860X
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 1c65a191d907..cbba921b6f1c 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -2,6 +2,7 @@
 # LED Core
 obj-$(CONFIG_NEW_LEDS)			+= led-core.o
 obj-$(CONFIG_LEDS_CLASS)		+= led-class.o
+obj-$(CONFIG_LEDS_CLASS_FLASH)		+= led-class-flash.o
 obj-$(CONFIG_LEDS_TRIGGERS)		+= led-triggers.o
 
 # LED Platform Drivers
diff --git a/drivers/leds/led-class-flash.c b/drivers/leds/led-class-flash.c
new file mode 100644
index 000000000000..4a19fd44f93f
--- /dev/null
+++ b/drivers/leds/led-class-flash.c
@@ -0,0 +1,486 @@
+/*
+ * LED Flash class interface
+ *
+ * Copyright (C) 2015 Samsung Electronics Co., Ltd.
+ * Author: Jacek Anaszewski <j.anaszewski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/led-class-flash.h>
+#include <linux/leds.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "leds.h"
+
+#define has_flash_op(fled_cdev, op)				\
+	(fled_cdev && fled_cdev->ops->op)
+
+#define call_flash_op(fled_cdev, op, args...)		\
+	((has_flash_op(fled_cdev, op)) ?			\
+			(fled_cdev->ops->op(fled_cdev, args)) :	\
+			-EINVAL)
+
+static const char * const led_flash_fault_names[] = {
+	"led-over-voltage",
+	"flash-timeout-exceeded",
+	"controller-over-temperature",
+	"controller-short-circuit",
+	"led-power-supply-over-current",
+	"indicator-led-fault",
+	"led-under-voltage",
+	"controller-under-voltage",
+	"led-over-temperature",
+};
+
+static ssize_t flash_brightness_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	unsigned long state;
+	ssize_t ret;
+
+	mutex_lock(&led_cdev->led_access);
+
+	if (led_sysfs_is_disabled(led_cdev)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		goto unlock;
+
+	ret = led_set_flash_brightness(fled_cdev, state);
+	if (ret < 0)
+		goto unlock;
+
+	ret = size;
+unlock:
+	mutex_unlock(&led_cdev->led_access);
+	return ret;
+}
+
+static ssize_t flash_brightness_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+
+	/* no lock needed for this */
+	led_update_flash_brightness(fled_cdev);
+
+	return sprintf(buf, "%u\n", fled_cdev->brightness.val);
+}
+static DEVICE_ATTR_RW(flash_brightness);
+
+static ssize_t max_flash_brightness_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+
+	return sprintf(buf, "%u\n", fled_cdev->brightness.max);
+}
+static DEVICE_ATTR_RO(max_flash_brightness);
+
+static ssize_t flash_strobe_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	unsigned long state;
+	ssize_t ret = -EINVAL;
+
+	mutex_lock(&led_cdev->led_access);
+
+	if (led_sysfs_is_disabled(led_cdev)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		goto unlock;
+
+	if (state < 0 || state > 1) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	ret = led_set_flash_strobe(fled_cdev, state);
+	if (ret < 0)
+		goto unlock;
+	ret = size;
+unlock:
+	mutex_unlock(&led_cdev->led_access);
+	return ret;
+}
+
+static ssize_t flash_strobe_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	bool state;
+	int ret;
+
+	/* no lock needed for this */
+	ret = led_get_flash_strobe(fled_cdev, &state);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%u\n", state);
+}
+static DEVICE_ATTR_RW(flash_strobe);
+
+static ssize_t flash_timeout_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	unsigned long flash_timeout;
+	ssize_t ret;
+
+	mutex_lock(&led_cdev->led_access);
+
+	if (led_sysfs_is_disabled(led_cdev)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	ret = kstrtoul(buf, 10, &flash_timeout);
+	if (ret)
+		goto unlock;
+
+	ret = led_set_flash_timeout(fled_cdev, flash_timeout);
+	if (ret < 0)
+		goto unlock;
+
+	ret = size;
+unlock:
+	mutex_unlock(&led_cdev->led_access);
+	return ret;
+}
+
+static ssize_t flash_timeout_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+
+	return sprintf(buf, "%u\n", fled_cdev->timeout.val);
+}
+static DEVICE_ATTR_RW(flash_timeout);
+
+static ssize_t max_flash_timeout_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+
+	return sprintf(buf, "%u\n", fled_cdev->timeout.max);
+}
+static DEVICE_ATTR_RO(max_flash_timeout);
+
+static ssize_t flash_fault_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	u32 fault, mask = 0x1;
+	char *pbuf = buf;
+	int i, ret, buf_len;
+
+	ret = led_get_flash_fault(fled_cdev, &fault);
+	if (ret < 0)
+		return -EINVAL;
+
+	*buf = '\0';
+
+	for (i = 0; i < LED_NUM_FLASH_FAULTS; ++i) {
+		if (fault & mask) {
+			buf_len = sprintf(pbuf, "%s ",
+					  led_flash_fault_names[i]);
+			pbuf += buf_len;
+		}
+		mask <<= 1;
+	}
+
+	return sprintf(buf, "%s\n", buf);
+}
+static DEVICE_ATTR_RO(flash_fault);
+
+static ssize_t available_sync_leds_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	char *pbuf = buf;
+	int i, buf_len;
+
+	buf_len = sprintf(pbuf, "[0: none] ");
+	pbuf += buf_len;
+
+	for (i = 0; i < fled_cdev->num_sync_leds; ++i) {
+		buf_len = sprintf(pbuf, "[%d: %s] ", i + 1,
+				  fled_cdev->sync_leds[i]->led_cdev.name);
+		pbuf += buf_len;
+	}
+
+	return sprintf(buf, "%s\n", buf);
+}
+static DEVICE_ATTR_RO(available_sync_leds);
+
+static ssize_t flash_sync_strobe_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	unsigned long led_id;
+	ssize_t ret;
+
+	mutex_lock(&led_cdev->led_access);
+
+	if (led_sysfs_is_disabled(led_cdev)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	ret = kstrtoul(buf, 10, &led_id);
+	if (ret)
+		goto unlock;
+
+	if (led_id > fled_cdev->num_sync_leds) {
+		ret = -ERANGE;
+		goto unlock;
+	}
+
+	fled_cdev->sync_led_id = led_id;
+
+	ret = size;
+unlock:
+	mutex_unlock(&led_cdev->led_access);
+	return ret;
+}
+
+static ssize_t flash_sync_strobe_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+	int sled_id = fled_cdev->sync_led_id;
+	char *sync_led_name = "none";
+
+	if (fled_cdev->sync_led_id > 0)
+		sync_led_name = (char *)
+			fled_cdev->sync_leds[sled_id - 1]->led_cdev.name;
+
+	return sprintf(buf, "[%d: %s]\n", sled_id, sync_led_name);
+}
+static DEVICE_ATTR_RW(flash_sync_strobe);
+
+static struct attribute *led_flash_strobe_attrs[] = {
+	&dev_attr_flash_strobe.attr,
+	NULL,
+};
+
+static struct attribute *led_flash_timeout_attrs[] = {
+	&dev_attr_flash_timeout.attr,
+	&dev_attr_max_flash_timeout.attr,
+	NULL,
+};
+
+static struct attribute *led_flash_brightness_attrs[] = {
+	&dev_attr_flash_brightness.attr,
+	&dev_attr_max_flash_brightness.attr,
+	NULL,
+};
+
+static struct attribute *led_flash_fault_attrs[] = {
+	&dev_attr_flash_fault.attr,
+	NULL,
+};
+
+static struct attribute *led_flash_sync_strobe_attrs[] = {
+	&dev_attr_available_sync_leds.attr,
+	&dev_attr_flash_sync_strobe.attr,
+	NULL,
+};
+
+static const struct attribute_group led_flash_strobe_group = {
+	.attrs = led_flash_strobe_attrs,
+};
+
+static const struct attribute_group led_flash_timeout_group = {
+	.attrs = led_flash_timeout_attrs,
+};
+
+static const struct attribute_group led_flash_brightness_group = {
+	.attrs = led_flash_brightness_attrs,
+};
+
+static const struct attribute_group led_flash_fault_group = {
+	.attrs = led_flash_fault_attrs,
+};
+
+static const struct attribute_group led_flash_sync_strobe_group = {
+	.attrs = led_flash_sync_strobe_attrs,
+};
+
+static void led_flash_resume(struct led_classdev *led_cdev)
+{
+	struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev);
+
+	call_flash_op(fled_cdev, flash_brightness_set,
+					fled_cdev->brightness.val);
+	call_flash_op(fled_cdev, timeout_set, fled_cdev->timeout.val);
+}
+
+static void led_flash_init_sysfs_groups(struct led_classdev_flash *fled_cdev)
+{
+	struct led_classdev *led_cdev = &fled_cdev->led_cdev;
+	const struct led_flash_ops *ops = fled_cdev->ops;
+	const struct attribute_group **flash_groups = fled_cdev->sysfs_groups;
+
+	int num_sysfs_groups = 0;
+
+	flash_groups[num_sysfs_groups++] = &led_flash_strobe_group;
+
+	if (ops->flash_brightness_set)
+		flash_groups[num_sysfs_groups++] = &led_flash_brightness_group;
+
+	if (ops->timeout_set)
+		flash_groups[num_sysfs_groups++] = &led_flash_timeout_group;
+
+	if (ops->fault_get)
+		flash_groups[num_sysfs_groups++] = &led_flash_fault_group;
+
+	if (led_cdev->flags & LED_DEV_CAP_SYNC_STROBE)
+		flash_groups[num_sysfs_groups++] = &led_flash_sync_strobe_group;
+
+	led_cdev->groups = flash_groups;
+}
+
+int led_classdev_flash_register(struct device *parent,
+				struct led_classdev_flash *fled_cdev)
+{
+	struct led_classdev *led_cdev;
+	const struct led_flash_ops *ops;
+	int ret;
+
+	if (!fled_cdev)
+		return -EINVAL;
+
+	led_cdev = &fled_cdev->led_cdev;
+
+	if (led_cdev->flags & LED_DEV_CAP_FLASH) {
+		if (!led_cdev->brightness_set_sync)
+			return -EINVAL;
+
+		ops = fled_cdev->ops;
+		if (!ops || !ops->strobe_set)
+			return -EINVAL;
+
+		led_cdev->flash_resume = led_flash_resume;
+
+		/* Select the sysfs attributes to be created for the device */
+		led_flash_init_sysfs_groups(fled_cdev);
+	}
+
+	/* Register led class device */
+	ret = led_classdev_register(parent, led_cdev);
+	if (ret < 0)
+		return ret;
+
+	/* Setting a torch brightness needs to have immediate effect */
+	led_cdev->flags &= ~SET_BRIGHTNESS_ASYNC;
+	led_cdev->flags |= SET_BRIGHTNESS_SYNC;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(led_classdev_flash_register);
+
+void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev)
+{
+	if (!fled_cdev)
+		return;
+
+	led_classdev_unregister(&fled_cdev->led_cdev);
+}
+EXPORT_SYMBOL_GPL(led_classdev_flash_unregister);
+
+static void led_clamp_align(struct led_flash_setting *s)
+{
+	u32 v, offset;
+
+	v = s->val + s->step / 2;
+	v = clamp(v, s->min, s->max);
+	offset = v - s->min;
+	offset = s->step * (offset / s->step);
+	s->val = s->min + offset;
+}
+
+int led_set_flash_timeout(struct led_classdev_flash *fled_cdev, u32 timeout)
+{
+	struct led_classdev *led_cdev = &fled_cdev->led_cdev;
+	struct led_flash_setting *s = &fled_cdev->timeout;
+
+	s->val = timeout;
+	led_clamp_align(s);
+
+	if (!(led_cdev->flags & LED_SUSPENDED))
+		return call_flash_op(fled_cdev, timeout_set, s->val);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(led_set_flash_timeout);
+
+int led_get_flash_fault(struct led_classdev_flash *fled_cdev, u32 *fault)
+{
+	return call_flash_op(fled_cdev, fault_get, fault);
+}
+EXPORT_SYMBOL_GPL(led_get_flash_fault);
+
+int led_set_flash_brightness(struct led_classdev_flash *fled_cdev,
+				u32 brightness)
+{
+	struct led_classdev *led_cdev = &fled_cdev->led_cdev;
+	struct led_flash_setting *s = &fled_cdev->brightness;
+
+	s->val = brightness;
+	led_clamp_align(s);
+
+	if (!(led_cdev->flags & LED_SUSPENDED))
+		return call_flash_op(fled_cdev, flash_brightness_set, s->val);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(led_set_flash_brightness);
+
+int led_update_flash_brightness(struct led_classdev_flash *fled_cdev)
+{
+	struct led_flash_setting *s = &fled_cdev->brightness;
+	u32 brightness;
+
+	if (has_flash_op(fled_cdev, flash_brightness_get)) {
+		int ret = call_flash_op(fled_cdev, flash_brightness_get,
+						&brightness);
+		if (ret < 0)
+			return ret;
+
+		s->val = brightness;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(led_update_flash_brightness);
+
+MODULE_AUTHOR("Jacek Anaszewski <j.anaszewski@samsung.com>");
+MODULE_DESCRIPTION("LED Flash class interface");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 291ca45ce8ba..795ec994c663 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -179,6 +179,10 @@ EXPORT_SYMBOL_GPL(led_classdev_suspend);
 void led_classdev_resume(struct led_classdev *led_cdev)
 {
 	led_cdev->brightness_set(led_cdev, led_cdev->brightness);
+
+	if (led_cdev->flash_resume)
+		led_cdev->flash_resume(led_cdev);
+
 	led_cdev->flags &= ~LED_SUSPENDED;
 }
 EXPORT_SYMBOL_GPL(led_classdev_resume);
diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h
new file mode 100644
index 000000000000..5ba2facd7a51
--- /dev/null
+++ b/include/linux/led-class-flash.h
@@ -0,0 +1,207 @@
+/*
+ * LED Flash class interface
+ *
+ * Copyright (C) 2015 Samsung Electronics Co., Ltd.
+ * Author: Jacek Anaszewski <j.anaszewski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#ifndef __LINUX_FLASH_LEDS_H_INCLUDED
+#define __LINUX_FLASH_LEDS_H_INCLUDED
+
+#include <linux/leds.h>
+#include <uapi/linux/v4l2-controls.h>
+
+struct device_node;
+struct led_classdev_flash;
+
+/*
+ * Supported led fault bits - must be kept in synch
+ * with V4L2_FLASH_FAULT bits.
+ */
+#define LED_FAULT_OVER_VOLTAGE		(1 << 0)
+#define LED_FAULT_TIMEOUT		(1 << 1)
+#define LED_FAULT_OVER_TEMPERATURE	(1 << 2)
+#define LED_FAULT_SHORT_CIRCUIT		(1 << 3)
+#define LED_FAULT_OVER_CURRENT		(1 << 4)
+#define LED_FAULT_INDICATOR		(1 << 5)
+#define LED_FAULT_UNDER_VOLTAGE		(1 << 6)
+#define LED_FAULT_INPUT_VOLTAGE		(1 << 7)
+#define LED_FAULT_LED_OVER_TEMPERATURE	(1 << 8)
+#define LED_NUM_FLASH_FAULTS		9
+
+#define LED_FLASH_MAX_SYSFS_GROUPS 7
+
+struct led_flash_ops {
+	/* set flash brightness */
+	int (*flash_brightness_set)(struct led_classdev_flash *fled_cdev,
+					u32 brightness);
+	/* get flash brightness */
+	int (*flash_brightness_get)(struct led_classdev_flash *fled_cdev,
+					u32 *brightness);
+	/* set flash strobe state */
+	int (*strobe_set)(struct led_classdev_flash *fled_cdev, bool state);
+	/* get flash strobe state */
+	int (*strobe_get)(struct led_classdev_flash *fled_cdev, bool *state);
+	/* set flash timeout */
+	int (*timeout_set)(struct led_classdev_flash *fled_cdev, u32 timeout);
+	/* get the flash LED fault */
+	int (*fault_get)(struct led_classdev_flash *fled_cdev, u32 *fault);
+};
+
+/*
+ * Current value of a flash setting along
+ * with its constraints.
+ */
+struct led_flash_setting {
+	/* maximum allowed value */
+	u32 min;
+	/* maximum allowed value */
+	u32 max;
+	/* step value */
+	u32 step;
+	/* current value */
+	u32 val;
+};
+
+struct led_classdev_flash {
+	/* led class device */
+	struct led_classdev led_cdev;
+
+	/* flash led specific ops */
+	const struct led_flash_ops *ops;
+
+	/* flash brightness value in microamperes along with its constraints */
+	struct led_flash_setting brightness;
+
+	/* flash timeout value in microseconds along with its constraints */
+	struct led_flash_setting timeout;
+
+	/* LED Flash class sysfs groups */
+	const struct attribute_group *sysfs_groups[LED_FLASH_MAX_SYSFS_GROUPS];
+
+	/* LEDs available for flash strobe synchronization */
+	struct led_classdev_flash **sync_leds;
+
+	/* Number of LEDs available for flash strobe synchronization */
+	int num_sync_leds;
+
+	/*
+	 * The identifier of the sub-led to synchronize the flash strobe with.
+	 * Identifiers start from 1, which reflects the first element from the
+	 * sync_leds array. 0 means that the flash strobe should not be
+	 * synchronized.
+	 */
+	u32 sync_led_id;
+};
+
+static inline struct led_classdev_flash *lcdev_to_flcdev(
+						struct led_classdev *lcdev)
+{
+	return container_of(lcdev, struct led_classdev_flash, led_cdev);
+}
+
+/**
+ * led_classdev_flash_register - register a new object of led_classdev class
+ *				 with support for flash LEDs
+ * @parent: the flash LED to register
+ * @fled_cdev: the led_classdev_flash structure for this device
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+extern int led_classdev_flash_register(struct device *parent,
+				struct led_classdev_flash *fled_cdev);
+
+/**
+ * led_classdev_flash_unregister - unregisters an object of led_classdev class
+ *				   with support for flash LEDs
+ * @fled_cdev: the flash LED to unregister
+ *
+ * Unregister a previously registered via led_classdev_flash_register object
+ */
+extern void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev);
+
+/**
+ * led_set_flash_strobe - setup flash strobe
+ * @fled_cdev: the flash LED to set strobe on
+ * @state: 1 - strobe flash, 0 - stop flash strobe
+ *
+ * Strobe the flash LED.
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+static inline int led_set_flash_strobe(struct led_classdev_flash *fled_cdev,
+					bool state)
+{
+	return fled_cdev->ops->strobe_set(fled_cdev, state);
+}
+
+/**
+ * led_get_flash_strobe - get flash strobe status
+ * @fled_cdev: the flash LED to query
+ * @state: 1 - flash is strobing, 0 - flash is off
+ *
+ * Check whether the flash is strobing at the moment.
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+static inline int led_get_flash_strobe(struct led_classdev_flash *fled_cdev,
+					bool *state)
+{
+	if (fled_cdev->ops->strobe_get)
+		return fled_cdev->ops->strobe_get(fled_cdev, state);
+
+	return -EINVAL;
+}
+
+/**
+ * led_set_flash_brightness - set flash LED brightness
+ * @fled_cdev: the flash LED to set
+ * @brightness: the brightness to set it to
+ *
+ * Set a flash LED's brightness.
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+extern int led_set_flash_brightness(struct led_classdev_flash *fled_cdev,
+					u32 brightness);
+
+/**
+ * led_update_flash_brightness - update flash LED brightness
+ * @fled_cdev: the flash LED to query
+ *
+ * Get a flash LED's current brightness and update led_flash->brightness
+ * member with the obtained value.
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+extern int led_update_flash_brightness(struct led_classdev_flash *fled_cdev);
+
+/**
+ * led_set_flash_timeout - set flash LED timeout
+ * @fled_cdev: the flash LED to set
+ * @timeout: the flash timeout to set it to
+ *
+ * Set the flash strobe duration.
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+extern int led_set_flash_timeout(struct led_classdev_flash *fled_cdev,
+					u32 timeout);
+
+/**
+ * led_get_flash_fault - get the flash LED fault
+ * @fled_cdev: the flash LED to query
+ * @fault: bitmask containing flash faults
+ *
+ * Get the flash LED fault.
+ *
+ * Returns: 0 on success or negative error value on failure
+ */
+extern int led_get_flash_fault(struct led_classdev_flash *fled_cdev,
+					u32 *fault);
+
+#endif	/* __LINUX_FLASH_LEDS_H_INCLUDED */
diff --git a/include/linux/leds.h b/include/linux/leds.h
index cfceef32c9b3..f70f84f35674 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -46,6 +46,8 @@ struct led_classdev {
 #define LED_SYSFS_DISABLE	(1 << 20)
 #define SET_BRIGHTNESS_ASYNC	(1 << 21)
 #define SET_BRIGHTNESS_SYNC	(1 << 22)
+#define LED_DEV_CAP_FLASH	(1 << 23)
+#define LED_DEV_CAP_SYNC_STROBE	(1 << 24)
 
 	/* Set LED brightness level */
 	/* Must not sleep, use a workqueue if needed */
@@ -81,6 +83,7 @@ struct led_classdev {
 	unsigned long		 blink_delay_on, blink_delay_off;
 	struct timer_list	 blink_timer;
 	int			 blink_brightness;
+	void			(*flash_resume)(struct led_classdev *led_cdev);
 
 	struct work_struct	set_brightness_work;
 	int			delayed_set_value;
-- 
cgit v1.2.3


From a2a15d54ab2d5c816deea306ae8e9101c40fe300 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 3 Jan 2015 09:51:33 -0800
Subject: device: Fix dev_dbg_once macro

There is a copy/paste typo in the dev_dbg_once macro.

It uses dev_info instead of dev_dbg, so use the correct
function instead.

Signed-off-by: Joe Perches <joe@perches.com>
Noticed-by: Marc Finet <m.dreadlock@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index fb506738f7b7..7b2532361398 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1156,7 +1156,7 @@ do {									\
 #define dev_info_once(dev, fmt, ...)					\
 	dev_level_once(dev_info, dev, fmt, ##__VA_ARGS__)
 #define dev_dbg_once(dev, fmt, ...)					\
-	dev_level_once(dev_info, dev, fmt, ##__VA_ARGS__)
+	dev_level_once(dev_dbg, dev, fmt, ##__VA_ARGS__)
 
 #define dev_level_ratelimited(dev_level, dev, fmt, ...)			\
 do {									\
-- 
cgit v1.2.3


From d1f1052c520452494db548b1a701cd9020b5903e Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 25 Dec 2014 15:07:04 -0800
Subject: device: Change dev_<level> logging functions to return void

No caller or macro uses the return value so make all
the functions return void.

Compiled x86 allyesconfig and defconfig w/o CONFIG_PRINTK

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 29 +++++++++---------------
 include/linux/device.h | 60 ++++++++++++++++++++++++--------------------------
 2 files changed, 40 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 97e2baf6e5d8..07304a3b9ee2 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2080,54 +2080,47 @@ int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...)
 }
 EXPORT_SYMBOL(dev_printk_emit);
 
-static int __dev_printk(const char *level, const struct device *dev,
+static void __dev_printk(const char *level, const struct device *dev,
 			struct va_format *vaf)
 {
-	if (!dev)
-		return printk("%s(NULL device *): %pV", level, vaf);
-
-	return dev_printk_emit(level[1] - '0', dev,
-			       "%s %s: %pV",
-			       dev_driver_string(dev), dev_name(dev), vaf);
+	if (dev)
+		dev_printk_emit(level[1] - '0', dev, "%s %s: %pV",
+				dev_driver_string(dev), dev_name(dev), vaf);
+	else
+		printk("%s(NULL device *): %pV", level, vaf);
 }
 
-int dev_printk(const char *level, const struct device *dev,
-	       const char *fmt, ...)
+void dev_printk(const char *level, const struct device *dev,
+		const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
-	int r;
 
 	va_start(args, fmt);
 
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	r = __dev_printk(level, dev, &vaf);
+	__dev_printk(level, dev, &vaf);
 
 	va_end(args);
-
-	return r;
 }
 EXPORT_SYMBOL(dev_printk);
 
 #define define_dev_printk_level(func, kern_level)		\
-int func(const struct device *dev, const char *fmt, ...)	\
+void func(const struct device *dev, const char *fmt, ...)	\
 {								\
 	struct va_format vaf;					\
 	va_list args;						\
-	int r;							\
 								\
 	va_start(args, fmt);					\
 								\
 	vaf.fmt = fmt;						\
 	vaf.va = &args;						\
 								\
-	r = __dev_printk(kern_level, dev, &vaf);		\
+	__dev_printk(kern_level, dev, &vaf);			\
 								\
 	va_end(args);						\
-								\
-	return r;						\
 }								\
 EXPORT_SYMBOL(func);
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 7b2532361398..0eb8ee2dc6d1 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1038,22 +1038,22 @@ extern __printf(3, 4)
 int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...);
 
 extern __printf(3, 4)
-int dev_printk(const char *level, const struct device *dev,
-	       const char *fmt, ...);
+void dev_printk(const char *level, const struct device *dev,
+		const char *fmt, ...);
 extern __printf(2, 3)
-int dev_emerg(const struct device *dev, const char *fmt, ...);
+void dev_emerg(const struct device *dev, const char *fmt, ...);
 extern __printf(2, 3)
-int dev_alert(const struct device *dev, const char *fmt, ...);
+void dev_alert(const struct device *dev, const char *fmt, ...);
 extern __printf(2, 3)
-int dev_crit(const struct device *dev, const char *fmt, ...);
+void dev_crit(const struct device *dev, const char *fmt, ...);
 extern __printf(2, 3)
-int dev_err(const struct device *dev, const char *fmt, ...);
+void dev_err(const struct device *dev, const char *fmt, ...);
 extern __printf(2, 3)
-int dev_warn(const struct device *dev, const char *fmt, ...);
+void dev_warn(const struct device *dev, const char *fmt, ...);
 extern __printf(2, 3)
-int dev_notice(const struct device *dev, const char *fmt, ...);
+void dev_notice(const struct device *dev, const char *fmt, ...);
 extern __printf(2, 3)
-int _dev_info(const struct device *dev, const char *fmt, ...);
+void _dev_info(const struct device *dev, const char *fmt, ...);
 
 #else
 
@@ -1065,35 +1065,35 @@ static inline __printf(3, 4)
 int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...)
 { return 0; }
 
-static inline int __dev_printk(const char *level, const struct device *dev,
-			       struct va_format *vaf)
-{ return 0; }
+static inline void __dev_printk(const char *level, const struct device *dev,
+				struct va_format *vaf)
+{}
 static inline __printf(3, 4)
-int dev_printk(const char *level, const struct device *dev,
-	       const char *fmt, ...)
-{ return 0; }
+void dev_printk(const char *level, const struct device *dev,
+		const char *fmt, ...)
+{}
 
 static inline __printf(2, 3)
-int dev_emerg(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void dev_emerg(const struct device *dev, const char *fmt, ...)
+{}
 static inline __printf(2, 3)
-int dev_crit(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void dev_crit(const struct device *dev, const char *fmt, ...)
+{}
 static inline __printf(2, 3)
-int dev_alert(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void dev_alert(const struct device *dev, const char *fmt, ...)
+{}
 static inline __printf(2, 3)
-int dev_err(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void dev_err(const struct device *dev, const char *fmt, ...)
+{}
 static inline __printf(2, 3)
-int dev_warn(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void dev_warn(const struct device *dev, const char *fmt, ...)
+{}
 static inline __printf(2, 3)
-int dev_notice(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void dev_notice(const struct device *dev, const char *fmt, ...)
+{}
 static inline __printf(2, 3)
-int _dev_info(const struct device *dev, const char *fmt, ...)
-{ return 0; }
+void _dev_info(const struct device *dev, const char *fmt, ...)
+{}
 
 #endif
 
@@ -1119,7 +1119,6 @@ do {						     \
 ({								\
 	if (0)							\
 		dev_printk(KERN_DEBUG, dev, format, ##arg);	\
-	0;							\
 })
 #endif
 
@@ -1215,7 +1214,6 @@ do {									\
 ({								\
 	if (0)							\
 		dev_printk(KERN_DEBUG, dev, format, ##arg);	\
-	0;							\
 })
 #endif
 
-- 
cgit v1.2.3


From 32357605ce7bcd36cb6215a21cad12cd8f51b74e Mon Sep 17 00:00:00 2001
From: Sharon Dvir <sharon.dvir1@mail.huji.ac.il>
Date: Thu, 22 Jan 2015 12:15:25 +0200
Subject: USB: Add missing word to comment in mod_devicetable.h

The documentation of match_flags in struct usb_device_id said:
'Bit mask controlling of the other fields are used to match against new devices.'
Changed to:
'Bit mask controlling which of the other fields are used to match against new devices.'
By adding the word 'which' and editing the next lines to not exceed 80 chars.

Signed-off-by: Sharon Dvir <sharon.dvir1@mail.huji.ac.il>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mod_devicetable.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 745def862580..470a240f66a1 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -53,9 +53,9 @@ struct ieee1394_device_id {
 
 /**
  * struct usb_device_id - identifies USB devices for probing and hotplugging
- * @match_flags: Bit mask controlling of the other fields are used to match
- *	against new devices.  Any field except for driver_info may be used,
- *	although some only make sense in conjunction with other fields.
+ * @match_flags: Bit mask controlling which of the other fields are used to
+ *	match against new devices. Any field except for driver_info may be
+ *	used, although some only make sense in conjunction with other fields.
  *	This is usually set by a USB_DEVICE_*() macro, which sets all
  *	other fields in this structure except for driver_info.
  * @idVendor: USB vendor ID for a device; numbers are assigned
-- 
cgit v1.2.3


From f4c5cf88fbd50e4779042268947b2e2f90c20484 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 18 Dec 2014 15:29:14 +0100
Subject: gpu: host1x: Provide a proper struct bus_type

Previously the struct bus_type exported by the host1x infrastructure was
only a very basic skeleton. Turn that implementation into a more full-
fledged bus to support proper probe ordering and power management.

Note that the bus infrastructure needs to be available before any of the
drivers can be registered. This is automatically ensured if all drivers
are built as loadable modules (via symbol dependencies). If all drivers
are built-in there are no such guarantees and the link order determines
the initcall ordering. Adjust drivers/gpu/Makefile to make sure that the
host1x bus infrastructure is initialized prior to any of its users (only
drm/tegra currently).

v2: Fix building host1x and tegra-drm as modules
    Reported-by: Dave Airlie <airlied@gmail.com>

Reviewed-by: Sean Paul <seanpaul@chromium.org>
Reviewed-by: Mark Zhang <markz@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/Makefile        |   5 +-
 drivers/gpu/drm/tegra/drm.c |   4 +-
 drivers/gpu/host1x/bus.c    | 111 ++++++++++++++++++++++++++++++--------------
 drivers/gpu/host1x/bus.h    |   4 +-
 drivers/gpu/host1x/dev.c    |   6 +--
 include/linux/host1x.h      |  18 +++++--
 6 files changed, 103 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile
index 70da9eb52a42..e9ed439a5b65 100644
--- a/drivers/gpu/Makefile
+++ b/drivers/gpu/Makefile
@@ -1,3 +1,6 @@
-obj-y			+= drm/ vga/
+# drm/tegra depends on host1x, so if both drivers are built-in care must be
+# taken to initialize them in the correct order. Link order is the only way
+# to ensure this currently.
 obj-$(CONFIG_TEGRA_HOST1X)	+= host1x/
+obj-y			+= drm/ vga/
 obj-$(CONFIG_IMX_IPUV3_CORE)	+= ipu-v3/
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index d4f827593dfa..a0c18ae79029 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -912,7 +912,9 @@ static const struct of_device_id host1x_drm_subdevs[] = {
 };
 
 static struct host1x_driver host1x_drm_driver = {
-	.name = "drm",
+	.driver = {
+		.name = "drm",
+	},
 	.probe = host1x_drm_probe,
 	.remove = host1x_drm_remove,
 	.subdevs = host1x_drm_subdevs,
diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 0b52f0ea8871..4a99c6416e6a 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -72,13 +72,14 @@ static void host1x_subdev_del(struct host1x_subdev *subdev)
 /**
  * host1x_device_parse_dt() - scan device tree and add matching subdevices
  */
-static int host1x_device_parse_dt(struct host1x_device *device)
+static int host1x_device_parse_dt(struct host1x_device *device,
+				  struct host1x_driver *driver)
 {
 	struct device_node *np;
 	int err;
 
 	for_each_child_of_node(device->dev.parent->of_node, np) {
-		if (of_match_node(device->driver->subdevs, np) &&
+		if (of_match_node(driver->subdevs, np) &&
 		    of_device_is_available(np)) {
 			err = host1x_subdev_add(device, np);
 			if (err < 0)
@@ -109,17 +110,12 @@ static void host1x_subdev_register(struct host1x_device *device,
 	mutex_unlock(&device->clients_lock);
 	mutex_unlock(&device->subdevs_lock);
 
-	/*
-	 * When all subdevices have been registered, the composite device is
-	 * ready to be probed.
-	 */
 	if (list_empty(&device->subdevs)) {
-		err = device->driver->probe(device);
+		err = device_add(&device->dev);
 		if (err < 0)
-			dev_err(&device->dev, "probe failed for %ps: %d\n",
-				device->driver, err);
+			dev_err(&device->dev, "failed to add: %d\n", err);
 		else
-			device->bound = true;
+			device->registered = true;
 	}
 }
 
@@ -127,18 +123,16 @@ static void __host1x_subdev_unregister(struct host1x_device *device,
 				       struct host1x_subdev *subdev)
 {
 	struct host1x_client *client = subdev->client;
-	int err;
 
 	/*
 	 * If all subdevices have been activated, we're about to remove the
 	 * first active subdevice, so unload the driver first.
 	 */
-	if (list_empty(&device->subdevs) && device->bound) {
-		err = device->driver->remove(device);
-		if (err < 0)
-			dev_err(&device->dev, "remove failed: %d\n", err);
-
-		device->bound = false;
+	if (list_empty(&device->subdevs)) {
+		if (device->registered) {
+			device->registered = false;
+			device_del(&device->dev);
+		}
 	}
 
 	/*
@@ -265,20 +259,60 @@ static int host1x_del_client(struct host1x *host1x,
 	return -ENODEV;
 }
 
-static struct bus_type host1x_bus_type = {
-	.name = "host1x",
-};
+static int host1x_device_match(struct device *dev, struct device_driver *drv)
+{
+	return strcmp(dev_name(dev), drv->name) == 0;
+}
 
-int host1x_bus_init(void)
+static int host1x_device_probe(struct device *dev)
 {
-	return bus_register(&host1x_bus_type);
+	struct host1x_driver *driver = to_host1x_driver(dev->driver);
+	struct host1x_device *device = to_host1x_device(dev);
+
+	if (driver->probe)
+		return driver->probe(device);
+
+	return 0;
 }
 
-void host1x_bus_exit(void)
+static int host1x_device_remove(struct device *dev)
 {
-	bus_unregister(&host1x_bus_type);
+	struct host1x_driver *driver = to_host1x_driver(dev->driver);
+	struct host1x_device *device = to_host1x_device(dev);
+
+	if (driver->remove)
+		return driver->remove(device);
+
+	return 0;
 }
 
+static void host1x_device_shutdown(struct device *dev)
+{
+	struct host1x_driver *driver = to_host1x_driver(dev->driver);
+	struct host1x_device *device = to_host1x_device(dev);
+
+	if (driver->shutdown)
+		driver->shutdown(device);
+}
+
+static const struct dev_pm_ops host1x_device_pm_ops = {
+	.suspend = pm_generic_suspend,
+	.resume = pm_generic_resume,
+	.freeze = pm_generic_freeze,
+	.thaw = pm_generic_thaw,
+	.poweroff = pm_generic_poweroff,
+	.restore = pm_generic_restore,
+};
+
+struct bus_type host1x_bus_type = {
+	.name = "host1x",
+	.match = host1x_device_match,
+	.probe = host1x_device_probe,
+	.remove = host1x_device_remove,
+	.shutdown = host1x_device_shutdown,
+	.pm = &host1x_device_pm_ops,
+};
+
 static void __host1x_device_del(struct host1x_device *device)
 {
 	struct host1x_subdev *subdev, *sd;
@@ -347,6 +381,8 @@ static int host1x_device_add(struct host1x *host1x,
 	if (!device)
 		return -ENOMEM;
 
+	device_initialize(&device->dev);
+
 	mutex_init(&device->subdevs_lock);
 	INIT_LIST_HEAD(&device->subdevs);
 	INIT_LIST_HEAD(&device->active);
@@ -357,18 +393,14 @@ static int host1x_device_add(struct host1x *host1x,
 
 	device->dev.coherent_dma_mask = host1x->dev->coherent_dma_mask;
 	device->dev.dma_mask = &device->dev.coherent_dma_mask;
+	dev_set_name(&device->dev, "%s", driver->driver.name);
 	device->dev.release = host1x_device_release;
-	dev_set_name(&device->dev, "%s", driver->name);
 	device->dev.bus = &host1x_bus_type;
 	device->dev.parent = host1x->dev;
 
-	err = device_register(&device->dev);
-	if (err < 0)
-		return err;
-
-	err = host1x_device_parse_dt(device);
+	err = host1x_device_parse_dt(device, driver);
 	if (err < 0) {
-		device_unregister(&device->dev);
+		kfree(device);
 		return err;
 	}
 
@@ -399,7 +431,12 @@ static int host1x_device_add(struct host1x *host1x,
 static void host1x_device_del(struct host1x *host1x,
 			      struct host1x_device *device)
 {
-	device_unregister(&device->dev);
+	if (device->registered) {
+		device->registered = false;
+		device_del(&device->dev);
+	}
+
+	put_device(&device->dev);
 }
 
 static void host1x_attach_driver(struct host1x *host1x,
@@ -474,7 +511,8 @@ int host1x_unregister(struct host1x *host1x)
 	return 0;
 }
 
-int host1x_driver_register(struct host1x_driver *driver)
+int host1x_driver_register_full(struct host1x_driver *driver,
+				struct module *owner)
 {
 	struct host1x *host1x;
 
@@ -491,9 +529,12 @@ int host1x_driver_register(struct host1x_driver *driver)
 
 	mutex_unlock(&devices_lock);
 
-	return 0;
+	driver->driver.bus = &host1x_bus_type;
+	driver->driver.owner = owner;
+
+	return driver_register(&driver->driver);
 }
-EXPORT_SYMBOL(host1x_driver_register);
+EXPORT_SYMBOL(host1x_driver_register_full);
 
 void host1x_driver_unregister(struct host1x_driver *driver)
 {
diff --git a/drivers/gpu/host1x/bus.h b/drivers/gpu/host1x/bus.h
index 4099e99212c8..88fb1c4aac68 100644
--- a/drivers/gpu/host1x/bus.h
+++ b/drivers/gpu/host1x/bus.h
@@ -18,10 +18,10 @@
 #ifndef HOST1X_BUS_H
 #define HOST1X_BUS_H
 
+struct bus_type;
 struct host1x;
 
-int host1x_bus_init(void);
-void host1x_bus_exit(void);
+extern struct bus_type host1x_bus_type;
 
 int host1x_register(struct host1x *host1x);
 int host1x_unregister(struct host1x *host1x);
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 2529908d304b..53d3d1d45b48 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -216,7 +216,7 @@ static int __init tegra_host1x_init(void)
 {
 	int err;
 
-	err = host1x_bus_init();
+	err = bus_register(&host1x_bus_type);
 	if (err < 0)
 		return err;
 
@@ -233,7 +233,7 @@ static int __init tegra_host1x_init(void)
 unregister_host1x:
 	platform_driver_unregister(&tegra_host1x_driver);
 unregister_bus:
-	host1x_bus_exit();
+	bus_unregister(&host1x_bus_type);
 	return err;
 }
 module_init(tegra_host1x_init);
@@ -242,7 +242,7 @@ static void __exit tegra_host1x_exit(void)
 {
 	platform_driver_unregister(&tegra_mipi_driver);
 	platform_driver_unregister(&tegra_host1x_driver);
-	host1x_bus_exit();
+	bus_unregister(&host1x_bus_type);
 }
 module_exit(tegra_host1x_exit);
 
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 7890b553d12e..464f33814a94 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -250,17 +250,29 @@ void host1x_job_unpin(struct host1x_job *job);
 struct host1x_device;
 
 struct host1x_driver {
+	struct device_driver driver;
+
 	const struct of_device_id *subdevs;
 	struct list_head list;
-	const char *name;
 
 	int (*probe)(struct host1x_device *device);
 	int (*remove)(struct host1x_device *device);
+	void (*shutdown)(struct host1x_device *device);
 };
 
-int host1x_driver_register(struct host1x_driver *driver);
+static inline struct host1x_driver *
+to_host1x_driver(struct device_driver *driver)
+{
+	return container_of(driver, struct host1x_driver, driver);
+}
+
+int host1x_driver_register_full(struct host1x_driver *driver,
+				struct module *owner);
 void host1x_driver_unregister(struct host1x_driver *driver);
 
+#define host1x_driver_register(driver) \
+	host1x_driver_register_full(driver, THIS_MODULE)
+
 struct host1x_device {
 	struct host1x_driver *driver;
 	struct list_head list;
@@ -273,7 +285,7 @@ struct host1x_device {
 	struct mutex clients_lock;
 	struct list_head clients;
 
-	bool bound;
+	bool registered;
 };
 
 static inline struct host1x_device *to_host1x_device(struct device *dev)
-- 
cgit v1.2.3


From 72c66644673a61ad85d293de7a61e54b9bdc9682 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Sun, 11 Jan 2015 21:10:07 +0200
Subject: iio: core: Introduce ENERGY channel type

Human activity sensors report the energy burnt by the user.
One of this devices is Freescale's MMA9553L
(http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf)
that computes the number of calories based on weight and step rate.

Introduce a new channel type ENERGY to export these values.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++
 drivers/iio/industrialio-core.c         |  1 +
 include/linux/iio/types.h               |  1 +
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 831db8623e4b..33118862fb8b 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -282,6 +282,7 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_current_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_accel_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_accel_peak_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_scale
+What:		/sys/bus/iio/devices/iio:deviceX/in_energy_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_x_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_y_scale
@@ -1049,6 +1050,15 @@ Description:
 		For a list of available output power modes read
 		in_accel_power_mode_available.
 
+What:		/sys/.../iio:deviceX/in_energy_input
+What:		/sys/.../iio:deviceX/in_energy_raw
+KernelVersion:	3.20
+Contact:	linux-iio@vger.kernel.org
+Description:
+		This attribute is used to read the energy value reported by the
+		device (e.g.: human activity sensors report energy burnt by the
+		user). Units after application of scale are Joules.
+
 What:		/sys/bus/iio/devices/iio:deviceX/store_eeprom
 KernelVersion:	3.4.0
 Contact:	linux-iio@vger.kernel.org
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 69feb912515a..8d2c9ba85fd7 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -72,6 +72,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_HUMIDITYRELATIVE] = "humidityrelative",
 	[IIO_ACTIVITY] = "activity",
 	[IIO_STEPS] = "steps",
+	[IIO_ENERGY] = "energy",
 };
 
 static const char * const iio_modifier_names[] = {
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 904dcbbf0e6f..26b8a1c5e2af 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -32,6 +32,7 @@ enum iio_chan_type {
 	IIO_HUMIDITYRELATIVE,
 	IIO_ACTIVITY,
 	IIO_STEPS,
+	IIO_ENERGY,
 };
 
 enum iio_modifier {
-- 
cgit v1.2.3


From cc3c9eecaed65b26ee0661e9e9491fd8d48e3907 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Sun, 11 Jan 2015 21:10:08 +0200
Subject: iio: core: Introduce DISTANCE channel type

Some devices export an estimation of the distance the user has covered
since the last reset.

One of this devices is Freescale's MMA9553L
(http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf)
that computes the distance based on the stride length and step rate.

Introduce a new channel type DISTANCE to export these values.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++
 drivers/iio/industrialio-core.c         |  1 +
 include/linux/iio/types.h               |  1 +
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 33118862fb8b..c627a9a1cd56 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -283,6 +283,7 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_accel_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_accel_peak_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_energy_scale
+What:		/sys/bus/iio/devices/iio:deviceX/in_distance_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_x_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_y_scale
@@ -1059,6 +1060,15 @@ Description:
 		device (e.g.: human activity sensors report energy burnt by the
 		user). Units after application of scale are Joules.
 
+What:		/sys/.../iio:deviceX/in_distance_input
+What:		/sys/.../iio:deviceX/in_distance_raw
+KernelVersion:	3.20
+Contact:	linux-iio@vger.kernel.org
+Description:
+		This attribute is used to read the distance covered by the user
+		since the last reboot while activated. Units after application
+		of scale are meters.
+
 What:		/sys/bus/iio/devices/iio:deviceX/store_eeprom
 KernelVersion:	3.4.0
 Contact:	linux-iio@vger.kernel.org
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 8d2c9ba85fd7..655755b49ccd 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -73,6 +73,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_ACTIVITY] = "activity",
 	[IIO_STEPS] = "steps",
 	[IIO_ENERGY] = "energy",
+	[IIO_DISTANCE] = "distance",
 };
 
 static const char * const iio_modifier_names[] = {
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 26b8a1c5e2af..a7de445222f4 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -33,6 +33,7 @@ enum iio_chan_type {
 	IIO_ACTIVITY,
 	IIO_STEPS,
 	IIO_ENERGY,
+	IIO_DISTANCE,
 };
 
 enum iio_modifier {
-- 
cgit v1.2.3


From 5a1a932981415661827f7edd9e99943a2a3b7b67 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Sun, 11 Jan 2015 21:10:09 +0200
Subject: iio: core: Introduce IIO_VELOCITY and IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z

Some devices export the current speed value of the user.

One of this devices is Freescale's MMA9553L
(http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf)
that computes the speed of the user based on the number of steps and
stride length.

Introduce a new channel type VELOCITY and a modifier for the magniture or
norm of the velocity vector, IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++
 drivers/iio/industrialio-core.c         |  2 ++
 include/linux/iio/types.h               |  2 ++
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index c627a9a1cd56..80b5efb1cdbf 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -295,6 +295,7 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_rot_from_north_true_tilt_comp_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_pressureY_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_pressure_scale
 What:		/sys/bus/iio/devices/iio:deviceX/in_humidityrelative_scale
+What:		/sys/bus/iio/devices/iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_scale
 KernelVersion:	2.6.35
 Contact:	linux-iio@vger.kernel.org
 Description:
@@ -1164,3 +1165,12 @@ Contact:	linux-iio@vger.kernel.org
 Description:
 		This attribute is used to read the number of steps taken by the user
 		since the last reboot while activated.
+
+What:		/sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_input
+What:		/sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_raw
+KernelVersion:	3.19
+Contact:	linux-iio@vger.kernel.org
+Description:
+		This attribute is used to read the current speed value of the
+		user (which is the norm or magnitude of the velocity vector).
+		Units after application of scale are m/s.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 655755b49ccd..18a8ab911aab 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -74,6 +74,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_STEPS] = "steps",
 	[IIO_ENERGY] = "energy",
 	[IIO_DISTANCE] = "distance",
+	[IIO_VELOCITY] = "velocity",
 };
 
 static const char * const iio_modifier_names[] = {
@@ -99,6 +100,7 @@ static const char * const iio_modifier_names[] = {
 	[IIO_MOD_JOGGING] = "jogging",
 	[IIO_MOD_WALKING] = "walking",
 	[IIO_MOD_STILL] = "still",
+	[IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z] = "sqrt(x^2+y^2+z^2)",
 };
 
 /* relies on pairs of these shared then separate */
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index a7de445222f4..c3601c2c0a9d 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -34,6 +34,7 @@ enum iio_chan_type {
 	IIO_STEPS,
 	IIO_ENERGY,
 	IIO_DISTANCE,
+	IIO_VELOCITY,
 };
 
 enum iio_modifier {
@@ -68,6 +69,7 @@ enum iio_modifier {
 	IIO_MOD_JOGGING,
 	IIO_MOD_WALKING,
 	IIO_MOD_STILL,
+	IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z,
 };
 
 enum iio_event_type {
-- 
cgit v1.2.3


From d37f6836fa285882450e28d1cbc5a9b624911e7e Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Sun, 11 Jan 2015 21:10:10 +0200
Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBWEIGHT

Some devices need the weight of the user to compute other
parameters. One of this devices is Freescale's MMA9553L
(http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf)
that needs the weight of the user to compute the number of calories burnt.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 7 +++++++
 drivers/iio/industrialio-core.c         | 1 +
 include/linux/iio/iio.h                 | 1 +
 3 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 80b5efb1cdbf..71dc8db4388b 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -351,6 +351,13 @@ Description:
 		to compute the stride length, distance, speed and activity
 		type.
 
+What:		/sys/bus/iio/devices/iio:deviceX/in_energy_calibweight
+KernelVersion:	3.20
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Weight of the user (in kg). It is needed by some pedometers
+		to compute the calories burnt by the user.
+
 What:		/sys/bus/iio/devices/iio:deviceX/in_accel_scale_available
 What:		/sys/.../iio:deviceX/in_voltageX_scale_available
 What:		/sys/.../iio:deviceX/in_voltage-voltage_scale_available
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 18a8ab911aab..4ee6fdfa92fe 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -125,6 +125,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_INT_TIME] = "integration_time",
 	[IIO_CHAN_INFO_ENABLE] = "en",
 	[IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight",
+	[IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight",
 };
 
 /**
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 590202024857..51f16437dacc 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -40,6 +40,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_INT_TIME,
 	IIO_CHAN_INFO_ENABLE,
 	IIO_CHAN_INFO_CALIBHEIGHT,
+	IIO_CHAN_INFO_CALIBWEIGHT,
 };
 
 enum iio_shared_by {
-- 
cgit v1.2.3


From 27be84236d75c13a83c45d850390f40b58401d97 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Sun, 11 Jan 2015 21:10:11 +0200
Subject: iio: core: Introduce CHANGE event type

A step detector will generate an interrupt each time N step are detected.
A device that has such pedometer functionality is Freescale's MMA9553L:
http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf.

Introduce IIO_EV_TYPE_CHANGE event type for events that are generated
when the channel passes a threshold on the absolute change in value.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio              | 20 ++++++++++++++++----
 drivers/iio/industrialio-event.c                     |  1 +
 .../staging/iio/Documentation/iio_event_monitor.c    |  2 ++
 include/linux/iio/types.h                            |  1 +
 4 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 71dc8db4388b..c03a1401b9ca 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -891,12 +891,24 @@ Description:
 		number or direction is not specified, applies to all channels of
 		this type.
 
-What:		/sys/.../events/in_steps_instance_en
-KernelVersion:	3.19
+What:		/sys/.../events/in_steps_change_en
+KernelVersion:	3.20
 Contact:	linux-iio@vger.kernel.org
 Description:
-		Enables or disables step detection. Each time the user takes a step an
-		event of this type will be generated.
+		Event generated when channel passes a threshold on the absolute
+		change in value. E.g. for steps: a step change event is
+		generated each time the user takes N steps, where N is set using
+		in_steps_change_value.
+
+What:		/sys/.../events/in_steps_change_value
+KernelVersion:	3.20
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Specifies the value of change threshold that the
+		device is comparing against for the events enabled by
+		<type>[Y][_name]_roc[_rising|falling|]_en. E.g. for steps:
+		if set to 3, a step change event will be generated every 3
+		steps.
 
 What:		/sys/bus/iio/devices/iio:deviceX/trigger/current_trigger
 KernelVersion:	2.6.35
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 78cf115dc0d4..b33ce55eb695 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -198,6 +198,7 @@ static const char * const iio_ev_type_text[] = {
 	[IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive",
 	[IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive",
 	[IIO_EV_TYPE_INSTANCE] = "instance",
+	[IIO_EV_TYPE_CHANGE] = "change",
 };
 
 static const char * const iio_ev_dir_text[] = {
diff --git a/drivers/staging/iio/Documentation/iio_event_monitor.c b/drivers/staging/iio/Documentation/iio_event_monitor.c
index def236abcb3e..2e78d58b9b77 100644
--- a/drivers/staging/iio/Documentation/iio_event_monitor.c
+++ b/drivers/staging/iio/Documentation/iio_event_monitor.c
@@ -60,6 +60,7 @@ static const char * const iio_ev_type_text[] = {
 	[IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive",
 	[IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive",
 	[IIO_EV_TYPE_INSTANCE] = "instance",
+	[IIO_EV_TYPE_CHANGE] = "change",
 };
 
 static const char * const iio_ev_dir_text[] = {
@@ -179,6 +180,7 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_EV_TYPE_THRESH_ADAPTIVE:
 	case IIO_EV_TYPE_MAG_ADAPTIVE:
 	case IIO_EV_TYPE_INSTANCE:
+	case IIO_EV_TYPE_CHANGE:
 		break;
 	default:
 		return false;
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index c3601c2c0a9d..3ba3d6678412 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -79,6 +79,7 @@ enum iio_event_type {
 	IIO_EV_TYPE_THRESH_ADAPTIVE,
 	IIO_EV_TYPE_MAG_ADAPTIVE,
 	IIO_EV_TYPE_INSTANCE,
+	IIO_EV_TYPE_CHANGE,
 };
 
 enum iio_event_info {
-- 
cgit v1.2.3


From 17a2cbc27981b85a09a48425c2614ae0cb7be8cd Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Sun, 11 Jan 2015 21:10:12 +0200
Subject: iio: core: Remove IIO_EV_TYPE_INSTANCE

By introducing IIO_EV_TYPE_CHANGE, IIO_EV_TYPE_INSTANCE becomes redundant.
The effect of IIO_EV_TYPE_INSTANCE can be obtained by using IIO_EV_TYPE_CHANGE
with IIO_EV_INFO_VALUE set to 1.

Remove all instances of IIO_EV_TYPE_INSTANCE and replace them with
IIO_EV_TYPE_CHANGE where needed.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-event.c                      | 1 -
 drivers/staging/iio/Documentation/iio_event_monitor.c | 2 --
 drivers/staging/iio/iio_simple_dummy.c                | 2 +-
 drivers/staging/iio/iio_simple_dummy_events.c         | 4 ++--
 include/linux/iio/types.h                             | 1 -
 5 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index b33ce55eb695..a4b397048f71 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -197,7 +197,6 @@ static const char * const iio_ev_type_text[] = {
 	[IIO_EV_TYPE_ROC] = "roc",
 	[IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive",
 	[IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive",
-	[IIO_EV_TYPE_INSTANCE] = "instance",
 	[IIO_EV_TYPE_CHANGE] = "change",
 };
 
diff --git a/drivers/staging/iio/Documentation/iio_event_monitor.c b/drivers/staging/iio/Documentation/iio_event_monitor.c
index 2e78d58b9b77..72c96aa6e992 100644
--- a/drivers/staging/iio/Documentation/iio_event_monitor.c
+++ b/drivers/staging/iio/Documentation/iio_event_monitor.c
@@ -59,7 +59,6 @@ static const char * const iio_ev_type_text[] = {
 	[IIO_EV_TYPE_ROC] = "roc",
 	[IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive",
 	[IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive",
-	[IIO_EV_TYPE_INSTANCE] = "instance",
 	[IIO_EV_TYPE_CHANGE] = "change",
 };
 
@@ -179,7 +178,6 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_EV_TYPE_ROC:
 	case IIO_EV_TYPE_THRESH_ADAPTIVE:
 	case IIO_EV_TYPE_MAG_ADAPTIVE:
-	case IIO_EV_TYPE_INSTANCE:
 	case IIO_EV_TYPE_CHANGE:
 		break;
 	default:
diff --git a/drivers/staging/iio/iio_simple_dummy.c b/drivers/staging/iio/iio_simple_dummy.c
index 0b8611ac1003..e4520213f627 100644
--- a/drivers/staging/iio/iio_simple_dummy.c
+++ b/drivers/staging/iio/iio_simple_dummy.c
@@ -73,7 +73,7 @@ static const struct iio_event_spec iio_dummy_event = {
  * simple step detect event - triggered when a step is detected
  */
 static const struct iio_event_spec step_detect_event = {
-	.type = IIO_EV_TYPE_INSTANCE,
+	.type = IIO_EV_TYPE_CHANGE,
 	.dir = IIO_EV_DIR_NONE,
 	.mask_separate = BIT(IIO_EV_INFO_ENABLE),
 };
diff --git a/drivers/staging/iio/iio_simple_dummy_events.c b/drivers/staging/iio/iio_simple_dummy_events.c
index ac15a44ba271..a5cd3bb219fe 100644
--- a/drivers/staging/iio/iio_simple_dummy_events.c
+++ b/drivers/staging/iio/iio_simple_dummy_events.c
@@ -86,7 +86,7 @@ int iio_simple_dummy_write_event_config(struct iio_dev *indio_dev,
 		}
 	case IIO_STEPS:
 		switch (type) {
-		case IIO_EV_TYPE_INSTANCE:
+		case IIO_EV_TYPE_CHANGE:
 			st->event_en = state;
 			break;
 		default:
@@ -201,7 +201,7 @@ static irqreturn_t iio_simple_dummy_event_handler(int irq, void *private)
 		iio_push_event(indio_dev,
 			       IIO_EVENT_CODE(IIO_STEPS, 0, IIO_NO_MOD,
 					      IIO_EV_DIR_NONE,
-					      IIO_EV_TYPE_INSTANCE, 0, 0, 0),
+					      IIO_EV_TYPE_CHANGE, 0, 0, 0),
 			       iio_get_time_ns());
 		break;
 	default:
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 3ba3d6678412..580ed5bdb3fa 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -78,7 +78,6 @@ enum iio_event_type {
 	IIO_EV_TYPE_ROC,
 	IIO_EV_TYPE_THRESH_ADAPTIVE,
 	IIO_EV_TYPE_MAG_ADAPTIVE,
-	IIO_EV_TYPE_INSTANCE,
 	IIO_EV_TYPE_CHANGE,
 };
 
-- 
cgit v1.2.3


From fafdd2bf2638157670f28462b641150d16dbaeca Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Mon, 24 Nov 2014 22:30:09 +0100
Subject: UBI: Implement UBI_METAONLY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UBI_METAONLY is a new open mode for UBI volumes, it indicates
that only meta data is being changed.
Meta data in terms of UBI volumes means data which is stored in the
UBI volume table but not on the volume itself.
While it does not interfere with UBI_READONLY and UBI_READWRITE
it is not allowed to use UBI_METAONLY together with UBI_EXCLUSIVE.

Cc: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Cc: Andrew Murray <amurray@embedded-bits.co.uk>
Signed-off-by: Richard Weinberger <richard@nod.at>
Tested-by: Guido Martínez <guido@vanguardiasur.com.ar>
Reviewed-by: Guido Martínez <guido@vanguardiasur.com.ar>
Tested-by: Christoph Fritz <chf.fritz@googlemail.com>
Tested-by: Andrew Murray <amurray@embedded-bits.co.uk>
---
 drivers/mtd/ubi/cdev.c  |  8 +++++---
 drivers/mtd/ubi/kapi.c  | 15 +++++++++++++--
 drivers/mtd/ubi/ubi.h   |  8 ++++++--
 include/linux/mtd/ubi.h |  5 ++++-
 4 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
index 3410ea8109f8..f5c715c32dcd 100644
--- a/drivers/mtd/ubi/cdev.c
+++ b/drivers/mtd/ubi/cdev.c
@@ -61,13 +61,13 @@ static int get_exclusive(struct ubi_device *ubi, struct ubi_volume_desc *desc)
 	struct ubi_volume *vol = desc->vol;
 
 	spin_lock(&vol->ubi->volumes_lock);
-	users = vol->readers + vol->writers + vol->exclusive;
+	users = vol->readers + vol->writers + vol->exclusive + vol->metaonly;
 	ubi_assert(users > 0);
 	if (users > 1) {
 		ubi_err(ubi, "%d users for volume %d", users, vol->vol_id);
 		err = -EBUSY;
 	} else {
-		vol->readers = vol->writers = 0;
+		vol->readers = vol->writers = vol->metaonly = 0;
 		vol->exclusive = 1;
 		err = desc->mode;
 		desc->mode = UBI_EXCLUSIVE;
@@ -87,13 +87,15 @@ static void revoke_exclusive(struct ubi_volume_desc *desc, int mode)
 	struct ubi_volume *vol = desc->vol;
 
 	spin_lock(&vol->ubi->volumes_lock);
-	ubi_assert(vol->readers == 0 && vol->writers == 0);
+	ubi_assert(vol->readers == 0 && vol->writers == 0 && vol->metaonly == 0);
 	ubi_assert(vol->exclusive == 1 && desc->mode == UBI_EXCLUSIVE);
 	vol->exclusive = 0;
 	if (mode == UBI_READONLY)
 		vol->readers = 1;
 	else if (mode == UBI_READWRITE)
 		vol->writers = 1;
+	else if (mode == UBI_METAONLY)
+		vol->metaonly = 1;
 	else
 		vol->exclusive = 1;
 	spin_unlock(&vol->ubi->volumes_lock);
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
index f3bab669f6bb..589c423fac2d 100644
--- a/drivers/mtd/ubi/kapi.c
+++ b/drivers/mtd/ubi/kapi.c
@@ -137,7 +137,7 @@ struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode)
 		return ERR_PTR(-EINVAL);
 
 	if (mode != UBI_READONLY && mode != UBI_READWRITE &&
-	    mode != UBI_EXCLUSIVE)
+	    mode != UBI_EXCLUSIVE && mode != UBI_METAONLY)
 		return ERR_PTR(-EINVAL);
 
 	/*
@@ -182,10 +182,17 @@ struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode)
 		break;
 
 	case UBI_EXCLUSIVE:
-		if (vol->exclusive || vol->writers || vol->readers)
+		if (vol->exclusive || vol->writers || vol->readers ||
+		    vol->metaonly)
 			goto out_unlock;
 		vol->exclusive = 1;
 		break;
+
+	case UBI_METAONLY:
+		if (vol->metaonly || vol->exclusive)
+			goto out_unlock;
+		vol->metaonly = 1;
+		break;
 	}
 	get_device(&vol->dev);
 	vol->ref_count += 1;
@@ -343,6 +350,10 @@ void ubi_close_volume(struct ubi_volume_desc *desc)
 		break;
 	case UBI_EXCLUSIVE:
 		vol->exclusive = 0;
+		break;
+	case UBI_METAONLY:
+		vol->metaonly = 0;
+		break;
 	}
 	vol->ref_count -= 1;
 	spin_unlock(&ubi->volumes_lock);
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
index f80ffaba9058..20cabc060b61 100644
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -261,6 +261,7 @@ struct ubi_fm_pool {
  * @readers: number of users holding this volume in read-only mode
  * @writers: number of users holding this volume in read-write mode
  * @exclusive: whether somebody holds this volume in exclusive mode
+ * @metaonly: whether somebody is altering only meta data of this volume
  *
  * @reserved_pebs: how many physical eraseblocks are reserved for this volume
  * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
@@ -309,6 +310,7 @@ struct ubi_volume {
 	int readers;
 	int writers;
 	int exclusive;
+	int metaonly;
 
 	int reserved_pebs;
 	int vol_type;
@@ -339,7 +341,8 @@ struct ubi_volume {
 /**
  * struct ubi_volume_desc - UBI volume descriptor returned when it is opened.
  * @vol: reference to the corresponding volume description object
- * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE)
+ * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, %UBI_EXCLUSIVE
+ * or %UBI_METAONLY)
  */
 struct ubi_volume_desc {
 	struct ubi_volume *vol;
@@ -390,7 +393,8 @@ struct ubi_debug_info {
  * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, beb_rsvd_pebs,
  *                @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count,
  *                @vol->readers, @vol->writers, @vol->exclusive,
- *                @vol->ref_count, @vol->mapping and @vol->eba_tbl.
+ *                @vol->metaonly, @vol->ref_count, @vol->mapping and
+ *                @vol->eba_tbl.
  * @ref_count: count of references on the UBI device
  * @image_seq: image sequence number recorded on EC headers
  *
diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
index c3918a0684fe..8fa2753316e8 100644
--- a/include/linux/mtd/ubi.h
+++ b/include/linux/mtd/ubi.h
@@ -34,11 +34,14 @@
  * UBI_READONLY: read-only mode
  * UBI_READWRITE: read-write mode
  * UBI_EXCLUSIVE: exclusive mode
+ * UBI_METAONLY: modify only the volume meta-data,
+ *  i.e. the data stored in the volume table, but not in any of volume LEBs.
  */
 enum {
 	UBI_READONLY = 1,
 	UBI_READWRITE,
-	UBI_EXCLUSIVE
+	UBI_EXCLUSIVE,
+	UBI_METAONLY
 };
 
 /**
-- 
cgit v1.2.3


From 9ff08979e17423f0f691c1d76f35dfec72a5e459 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Sat, 10 Jan 2015 22:52:13 +0100
Subject: UBI: Add initial support for scatter gather

Adds a new set of functions to deal with scatter gather.
ubi_eba_read_leb_sg() will read from a LEB into a scatter gather list.
The new data structure struct ubi_sgl will be used within UBI to
hold the scatter gather list itself and metadata to have a cursor
within the list.

Signed-off-by: Richard Weinberger <richard@nod.at>
Tested-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
---
 drivers/mtd/ubi/eba.c   | 55 ++++++++++++++++++++++++++++
 drivers/mtd/ubi/kapi.c  | 95 +++++++++++++++++++++++++++++++++++++++++--------
 drivers/mtd/ubi/ubi.h   |  3 ++
 include/linux/mtd/ubi.h | 48 +++++++++++++++++++++++++
 4 files changed, 186 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
index a40020cf0923..f1db73da7975 100644
--- a/drivers/mtd/ubi/eba.c
+++ b/drivers/mtd/ubi/eba.c
@@ -479,6 +479,61 @@ out_unlock:
 	return err;
 }
 
+/**
+ * ubi_eba_read_leb_sg - read data into a scatter gather list.
+ * @ubi: UBI device description object
+ * @vol: volume description object
+ * @lnum: logical eraseblock number
+ * @sgl: UBI scatter gather list to store the read data
+ * @offset: offset from where to read
+ * @len: how many bytes to read
+ * @check: data CRC check flag
+ *
+ * This function works exactly like ubi_eba_read_leb(). But instead of
+ * storing the read data into a buffer it writes to an UBI scatter gather
+ * list.
+ */
+int ubi_eba_read_leb_sg(struct ubi_device *ubi, struct ubi_volume *vol,
+			struct ubi_sgl *sgl, int lnum, int offset, int len,
+			int check)
+{
+	int to_read;
+	int ret;
+	struct scatterlist *sg;
+
+	for (;;) {
+		ubi_assert(sgl->list_pos < UBI_MAX_SG_COUNT);
+		sg = &sgl->sg[sgl->list_pos];
+		if (len < sg->length - sgl->page_pos)
+			to_read = len;
+		else
+			to_read = sg->length - sgl->page_pos;
+
+		ret = ubi_eba_read_leb(ubi, vol, lnum,
+				       sg_virt(sg) + sgl->page_pos, offset,
+				       to_read, check);
+		if (ret < 0)
+			return ret;
+
+		offset += to_read;
+		len -= to_read;
+		if (!len) {
+			sgl->page_pos += to_read;
+			if (sgl->page_pos == sg->length) {
+				sgl->list_pos++;
+				sgl->page_pos = 0;
+			}
+
+			break;
+		}
+
+		sgl->list_pos++;
+		sgl->page_pos = 0;
+	}
+
+	return ret;
+}
+
 /**
  * recover_peb - recover from write failure.
  * @ubi: UBI device description object
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
index 589c423fac2d..478e00cf2d9e 100644
--- a/drivers/mtd/ubi/kapi.c
+++ b/drivers/mtd/ubi/kapi.c
@@ -365,6 +365,43 @@ void ubi_close_volume(struct ubi_volume_desc *desc)
 }
 EXPORT_SYMBOL_GPL(ubi_close_volume);
 
+/**
+ * leb_read_sanity_check - does sanity checks on read requests.
+ * @desc: volume descriptor
+ * @lnum: logical eraseblock number to read from
+ * @offset: offset within the logical eraseblock to read from
+ * @len: how many bytes to read
+ *
+ * This function is used by ubi_leb_read() and ubi_leb_read_sg()
+ * to perform sanity checks.
+ */
+static int leb_read_sanity_check(struct ubi_volume_desc *desc, int lnum,
+				 int offset, int len)
+{
+	struct ubi_volume *vol = desc->vol;
+	struct ubi_device *ubi = vol->ubi;
+	int vol_id = vol->vol_id;
+
+	if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 ||
+	    lnum >= vol->used_ebs || offset < 0 || len < 0 ||
+	    offset + len > vol->usable_leb_size)
+		return -EINVAL;
+
+	if (vol->vol_type == UBI_STATIC_VOLUME) {
+		if (vol->used_ebs == 0)
+			/* Empty static UBI volume */
+			return 0;
+		if (lnum == vol->used_ebs - 1 &&
+		    offset + len > vol->last_eb_bytes)
+			return -EINVAL;
+	}
+
+	if (vol->upd_marker)
+		return -EBADF;
+
+	return 0;
+}
+
 /**
  * ubi_leb_read - read data.
  * @desc: volume descriptor
@@ -401,22 +438,10 @@ int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 
 	dbg_gen("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset);
 
-	if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 ||
-	    lnum >= vol->used_ebs || offset < 0 || len < 0 ||
-	    offset + len > vol->usable_leb_size)
-		return -EINVAL;
-
-	if (vol->vol_type == UBI_STATIC_VOLUME) {
-		if (vol->used_ebs == 0)
-			/* Empty static UBI volume */
-			return 0;
-		if (lnum == vol->used_ebs - 1 &&
-		    offset + len > vol->last_eb_bytes)
-			return -EINVAL;
-	}
+	err = leb_read_sanity_check(desc, lnum, offset, len);
+	if (err < 0)
+		return err;
 
-	if (vol->upd_marker)
-		return -EBADF;
 	if (len == 0)
 		return 0;
 
@@ -430,6 +455,46 @@ int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 }
 EXPORT_SYMBOL_GPL(ubi_leb_read);
 
+
+/**
+ * ubi_leb_read_sg - read data into a scatter gather list.
+ * @desc: volume descriptor
+ * @lnum: logical eraseblock number to read from
+ * @buf: buffer where to store the read data
+ * @offset: offset within the logical eraseblock to read from
+ * @len: how many bytes to read
+ * @check: whether UBI has to check the read data's CRC or not.
+ *
+ * This function works exactly like ubi_leb_read_sg(). But instead of
+ * storing the read data into a buffer it writes to an UBI scatter gather
+ * list.
+ */
+int ubi_leb_read_sg(struct ubi_volume_desc *desc, int lnum, struct ubi_sgl *sgl,
+		    int offset, int len, int check)
+{
+	struct ubi_volume *vol = desc->vol;
+	struct ubi_device *ubi = vol->ubi;
+	int err, vol_id = vol->vol_id;
+
+	dbg_gen("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset);
+
+	err = leb_read_sanity_check(desc, lnum, offset, len);
+	if (err < 0)
+		return err;
+
+	if (len == 0)
+		return 0;
+
+	err = ubi_eba_read_leb_sg(ubi, vol, sgl, lnum, offset, len, check);
+	if (err && mtd_is_eccerr(err) && vol->vol_type == UBI_STATIC_VOLUME) {
+		ubi_warn(ubi, "mark volume %d as corrupted", vol_id);
+		vol->corrupted = 1;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ubi_leb_read_sg);
+
 /**
  * ubi_leb_write - write data.
  * @desc: volume descriptor
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
index 20cabc060b61..d94b81fa2306 100644
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -795,6 +795,9 @@ int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol,
 		      int lnum);
 int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
 		     void *buf, int offset, int len, int check);
+int ubi_eba_read_leb_sg(struct ubi_device *ubi, struct ubi_volume *vol,
+			struct ubi_sgl *sgl, int lnum, int offset, int len,
+			int check);
 int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
 		      const void *buf, int offset, int len);
 int ubi_eba_write_leb_st(struct ubi_device *ubi, struct ubi_volume *vol,
diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
index 8fa2753316e8..1e271cb559cd 100644
--- a/include/linux/mtd/ubi.h
+++ b/include/linux/mtd/ubi.h
@@ -23,11 +23,18 @@
 
 #include <linux/ioctl.h>
 #include <linux/types.h>
+#include <linux/scatterlist.h>
 #include <mtd/ubi-user.h>
 
 /* All voumes/LEBs */
 #define UBI_ALL -1
 
+/*
+ * Maximum number of scatter gather list entries,
+ * we use only 64 to have a lower memory foot print.
+ */
+#define UBI_MAX_SG_COUNT 64
+
 /*
  * enum ubi_open_mode - UBI volume open mode constants.
  *
@@ -118,6 +125,35 @@ struct ubi_volume_info {
 	dev_t cdev;
 };
 
+/**
+ * struct ubi_sgl - UBI scatter gather list data structure.
+ * @list_pos: current position in @sg[]
+ * @page_pos: current position in @sg[@list_pos]
+ * @sg: the scatter gather list itself
+ *
+ * ubi_sgl is a wrapper around a scatter list which keeps track of the
+ * current position in the list and the current list item such that
+ * it can be used across multiple ubi_leb_read_sg() calls.
+ */
+struct ubi_sgl {
+	int list_pos;
+	int page_pos;
+	struct scatterlist sg[UBI_MAX_SG_COUNT];
+};
+
+/**
+ * ubi_sgl_init - initialize an UBI scatter gather list data structure.
+ * @usgl: the UBI scatter gather struct itself
+ *
+ * Please note that you still have to use sg_init_table() or any adequate
+ * function to initialize the unterlaying struct scatterlist.
+ */
+static inline void ubi_sgl_init(struct ubi_sgl *usgl)
+{
+	usgl->list_pos = 0;
+	usgl->page_pos = 0;
+}
+
 /**
  * struct ubi_device_info - UBI device description data structure.
  * @ubi_num: ubi device number
@@ -213,6 +249,8 @@ int ubi_unregister_volume_notifier(struct notifier_block *nb);
 void ubi_close_volume(struct ubi_volume_desc *desc);
 int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 		 int len, int check);
+int ubi_leb_read_sg(struct ubi_volume_desc *desc, int lnum, struct ubi_sgl *sgl,
+		   int offset, int len, int check);
 int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
 		  int offset, int len);
 int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
@@ -233,4 +271,14 @@ static inline int ubi_read(struct ubi_volume_desc *desc, int lnum, char *buf,
 {
 	return ubi_leb_read(desc, lnum, buf, offset, len, 0);
 }
+
+/*
+ * This function is the same as the 'ubi_leb_read_sg()' function, but it does
+ * not provide the checking capability.
+ */
+static inline int ubi_read_sg(struct ubi_volume_desc *desc, int lnum,
+			      struct ubi_sgl *sgl, int offset, int len)
+{
+	return ubi_leb_read_sg(desc, lnum, sgl, offset, len, 0);
+}
 #endif /* !__LINUX_UBI_H__ */
-- 
cgit v1.2.3


From 6ea22486ba46bcb665de36514094d74575cd1330 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Wed, 28 Jan 2015 12:48:53 +0000
Subject: tracing: Add array printing helper

If a trace event contains an array, there is currently no standard
way to format this for text output.  Drivers are currently hacking
around this by a) local hacks that use the trace_seq functionailty
directly, or b) just not printing that information.  For fixed size
arrays, formatting of the elements can be open-coded, but this gets
cumbersome for arrays of non-trivial size.

These approaches result in non-standard content of the event format
description delivered to userspace, so userland tools needs to be
taught to understand and parse each array printing method
individually.

This patch implements a __print_array() helper that tracepoint
implementations can use instead of reinventing it.  A simple C-style
syntax is used to delimit the array and its elements {like,this}.

So that the helpers can be used with large static arrays as well as
dynamic arrays, they take a pointer and element count: they can be
used with __get_dynamic_array() for use with dynamic arrays.
Link: http://lkml.kernel.org/r/1422449335-8289-2-git-send-email-javi.merino@arm.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  4 ++++
 include/trace/ftrace.h       |  9 +++++++++
 kernel/trace/trace_output.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..5aa4a9269547 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -44,6 +44,10 @@ const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 const char *ftrace_print_hex_seq(struct trace_seq *p,
 				 const unsigned char *buf, int len);
 
+const char *ftrace_print_array_seq(struct trace_seq *p,
+				   const void *buf, int buf_len,
+				   size_t el_size);
+
 struct trace_iterator;
 struct trace_event;
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..304901fc5f34 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -263,6 +263,14 @@
 #undef __print_hex
 #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len)
 
+#undef __print_array
+#define __print_array(array, count, el_size)				\
+	({								\
+		BUILD_BUG_ON(el_size != 1 && el_size != 2 &&		\
+			     el_size != 4 && el_size != 8);		\
+		ftrace_print_array_seq(p, array, count, el_size);	\
+	})
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace enum print_line_t					\
@@ -674,6 +682,7 @@ static inline void ftrace_test_probe_##call(void)			\
 #undef __get_dynamic_array_len
 #undef __get_str
 #undef __get_bitmask
+#undef __print_array
 
 #undef TP_printk
 #define TP_printk(fmt, args...) "\"" fmt "\", "  __stringify(args)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b77b9a697619..692bf7184c8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 }
 EXPORT_SYMBOL(ftrace_print_hex_seq);
 
+const char *
+ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
+		       size_t el_size)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	const char *prefix = "";
+	void *ptr = (void *)buf;
+
+	trace_seq_putc(p, '{');
+
+	while (ptr < buf + buf_len) {
+		switch (el_size) {
+		case 1:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u8 *)ptr);
+			break;
+		case 2:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u16 *)ptr);
+			break;
+		case 4:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u32 *)ptr);
+			break;
+		case 8:
+			trace_seq_printf(p, "%s0x%llx", prefix,
+					 *(u64 *)ptr);
+			break;
+		default:
+			trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
+					 *(u8 *)ptr);
+			el_size = 1;
+		}
+		prefix = ",";
+		ptr += el_size;
+	}
+
+	trace_seq_putc(p, '}');
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+EXPORT_SYMBOL(ftrace_print_array_seq);
+
 int ftrace_raw_output_prep(struct trace_iterator *iter,
 			   struct trace_event *trace_event)
 {
-- 
cgit v1.2.3


From ad9cf3bbd18a94806314741ac8092c3422f5aebe Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 16 Dec 2014 12:54:25 -0500
Subject: block: mark blk-mq devices as stackable

Commit 4ee5eaf4 ("block: add a queue flag for request stacking support")
introduced the concept of "STACKABLE" and blk-mq devices fit the
definition in that they establish q->request_fn.  So establish
QUEUE_FLAG_STACKABLE in QUEUE_FLAG_MQ_DEFAULT.

While not strictly needed (DM _could_ just check for q->mq_ops to assume
the device is request-based), request-based DM support for blk-mq devices
benefits from the ability to consistently check for QUEUE_FLAG_STACKABLE
before allowing a device to be stacked into a request-based DM table.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6f388fd1c11c..13e16401a7ce 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -520,6 +520,7 @@ struct request_queue {
 				 (1 << QUEUE_FLAG_ADD_RANDOM))
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
+				 (1 << QUEUE_FLAG_STACKABLE)	|	\
 				 (1 << QUEUE_FLAG_SAME_COMP))
 
 static inline void queue_lockdep_assert_held(struct request_queue *q)
-- 
cgit v1.2.3


From 05afcb77eb4713f46e7ebaa3cb54bc465c5d516e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 23 Jan 2015 01:08:07 -0500
Subject: new helper: iov_iter_bvec()

similar to iov_iter_kvec(), for ITER_BVEC ones

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/splice.c         |  7 ++-----
 include/linux/uio.h |  4 +++-
 mm/iov_iter.c       | 17 +++++++++++++++--
 mm/page_io.c        |  9 ++-------
 4 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7c7176f2d1e7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1006,11 +1006,8 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		}
 
 		/* ... iov_iter */
-		from.type = ITER_BVEC | WRITE;
-		from.bvec = array;
-		from.nr_segs = n;
-		from.count = sd.total_len - left;
-		from.iov_offset = 0;
+		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
+			      sd.total_len - left);
 
 		/* ... and iocb */
 		init_sync_kiocb(&kiocb, out);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 1c5e453f7ea9..b447402d9737 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -88,7 +88,9 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
 			unsigned long nr_segs, size_t count);
-void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *iov,
+void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
+			unsigned long nr_segs, size_t count);
+void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec,
 			unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index a1599ca4ab0e..827732047da1 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
 EXPORT_SYMBOL(iov_iter_single_seg_count);
 
 void iov_iter_kvec(struct iov_iter *i, int direction,
-			const struct kvec *iov, unsigned long nr_segs,
+			const struct kvec *kvec, unsigned long nr_segs,
 			size_t count)
 {
 	BUG_ON(!(direction & ITER_KVEC));
 	i->type = direction;
-	i->kvec = (struct kvec *)iov;
+	i->kvec = kvec;
 	i->nr_segs = nr_segs;
 	i->iov_offset = 0;
 	i->count = count;
 }
 EXPORT_SYMBOL(iov_iter_kvec);
 
+void iov_iter_bvec(struct iov_iter *i, int direction,
+			const struct bio_vec *bvec, unsigned long nr_segs,
+			size_t count)
+{
+	BUG_ON(!(direction & ITER_BVEC));
+	i->type = direction;
+	i->bvec = bvec;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_bvec);
+
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
 	unsigned long res = 0;
diff --git a/mm/page_io.c b/mm/page_io.c
index 955db8b0d497..e6045804c8d8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 			.bv_len  = PAGE_SIZE,
 			.bv_offset = 0
 		};
-		struct iov_iter from = {
-			.type = ITER_BVEC | WRITE,
-			.count = PAGE_SIZE,
-			.iov_offset = 0,
-			.nr_segs = 1,
-		};
-		from.bvec = &bv;	/* older gcc versions are broken */
+		struct iov_iter from;
 
+		iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
 		init_sync_kiocb(&kiocb, swap_file);
 		kiocb.ki_pos = page_file_offset(page);
 		kiocb.ki_nbytes = PAGE_SIZE;
-- 
cgit v1.2.3


From dbe4e192a234cd6133d86fffb965d0f032c12ccc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 25 Jan 2015 21:11:59 +0100
Subject: fs: add vfs_iter_{read,write} helpers

Simple helpers that pass an arbitrary iov_iter to filesystems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/read_write.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/splice.c        | 16 ++--------------
 include/linux/fs.h |  3 +++
 3 files changed, 51 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..ab4f26a7d5cb 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
 }
 #endif
 
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= READ;
+	ret = file->f_op->read_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= WRITE;
+	ret = file->f_op->write_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
diff --git a/fs/splice.c b/fs/splice.c
index 7c7176f2d1e7..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	splice_from_pipe_begin(&sd);
 	while (sd.total_len) {
 		struct iov_iter from;
-		struct kiocb kiocb;
 		size_t left;
 		int n, idx;
 
@@ -1005,26 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			left -= this_len;
 		}
 
-		/* ... iov_iter */
 		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
 			      sd.total_len - left);
-
-		/* ... and iocb */
-		init_sync_kiocb(&kiocb, out);
-		kiocb.ki_pos = sd.pos;
-		kiocb.ki_nbytes = sd.total_len - left;
-
-		/* now, send it */
-		ret = out->f_op->write_iter(&kiocb, &from);
-		if (-EIOCBQUEUED == ret)
-			ret = wait_on_sync_kiocb(&kiocb);
-
+		ret = vfs_iter_write(out, &from, &sd.pos);
 		if (ret <= 0)
 			break;
 
 		sd.num_spliced += ret;
 		sd.total_len -= ret;
-		*ppos = sd.pos = kiocb.ki_pos;
+		*ppos = sd.pos;
 
 		/* dismiss the fully eaten buffers, adjust the partial one */
 		while (ret) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42efe13077b6..1f3c43952540 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2494,6 +2494,9 @@ extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t l
 extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos);
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos);
+
 /* fs/block_dev.c */
 extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from);
-- 
cgit v1.2.3


From 3b0f1d01e501792d8d89ab4371bc9e8cd2a10032 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Wed, 28 Jan 2015 10:54:23 +0800
Subject: KVM: Rename kvm_arch_mmu_write_protect_pt_masked to be more generic
 for log dirty

We don't have to write protect guest memory for dirty logging if architecture
supports hardware dirty logging, such as PML on VMX, so rename it to be more
generic.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kvm/mmu.c       | 18 ++++++++++++++++--
 arch/x86/kvm/mmu.c       | 21 +++++++++++++++++++--
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      |  2 +-
 4 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 74aeabaa3c4d..6034697ede3f 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1081,7 +1081,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 }
 
 /**
- * kvm_arch_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
  * @kvm:	The KVM pointer
  * @slot:	The memory slot associated with mask
  * @gfn_offset:	The gfn offset in memory slot
@@ -1091,7 +1091,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
  * Walks bits set in mask write protects the associated pte's. Caller must
  * acquire kvm_mmu_lock.
  */
-void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 		struct kvm_memory_slot *slot,
 		gfn_t gfn_offset, unsigned long mask)
 {
@@ -1102,6 +1102,20 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
 	stage2_wp_range(kvm, start, end);
 }
 
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0ed9f795e4f0..b18e65ce3683 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1216,7 +1216,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 }
 
 /**
- * kvm_arch_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
  * @slot: slot to protect
  * @gfn_offset: start of the BITS_PER_LONG pages we care about
@@ -1225,7 +1225,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
  * Used when we do not need to care about huge page mappings: e.g. during dirty
  * logging we do not have any such mappings.
  */
-void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 				     struct kvm_memory_slot *slot,
 				     gfn_t gfn_offset, unsigned long mask)
 {
@@ -1241,6 +1241,23 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
 	}
 }
 
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+				struct kvm_memory_slot *slot,
+				gfn_t gfn_offset, unsigned long mask)
+{
+	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
 static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d6719522f1f..32d057571bf6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -615,7 +615,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 int kvm_get_dirty_log_protect(struct kvm *kvm,
 			struct kvm_dirty_log *log, bool *is_dirty);
 
-void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					struct kvm_memory_slot *slot,
 					gfn_t gfn_offset,
 					unsigned long mask);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a8490f084483..0c281760a1c5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1059,7 +1059,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		dirty_bitmap_buffer[i] = mask;
 
 		offset = i * BITS_PER_LONG;
-		kvm_arch_mmu_write_protect_pt_masked(kvm, memslot, offset,
+		kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset,
 								mask);
 	}
 
-- 
cgit v1.2.3


From 80b2502cea34a965a6b3390691854e753945ca5f Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Wed, 28 Jan 2015 16:32:24 +0800
Subject: usb: gadget: introduce is_selfpowered for usb_gadget

Whether the gadget is selfpowerwed or not can be determined by composite
core, so we can use a common entry to indicate if the self-powered
is supported by gadget, and the related private variable at individual
udc driver can be deleted.

Signed-off-by: Peter Chen <peter.chen@freescale.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/gadget.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 70ddb3943b62..e2f00fd8cd47 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -523,6 +523,7 @@ struct usb_gadget_ops {
  *	enabled HNP support.
  * @quirk_ep_out_aligned_size: epout requires buffer size to be aligned to
  *	MaxPacketSize.
+ * @is_selfpowered: if the gadget is self-powered.
  *
  * Gadgets have a mostly-portable "gadget driver" implementing device
  * functions, handling all usb configurations and interfaces.  Gadget
@@ -563,6 +564,7 @@ struct usb_gadget {
 	unsigned			a_hnp_support:1;
 	unsigned			a_alt_hnp_support:1;
 	unsigned			quirk_ep_out_aligned_size:1;
+	unsigned			is_selfpowered:1;
 };
 #define work_to_gadget(w)	(container_of((w), struct usb_gadget, work))
 
-- 
cgit v1.2.3


From ac3dd5bd128b1d1ce2a037775766f39d06a4848a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 22 Jan 2015 12:07:58 -0700
Subject: NVMe: avoid kmalloc/kfree for smaller IO

Currently we allocate an nvme_iod for each IO, which holds the
sg list, prps, and other IO related info. Set a threshold of
2 pages and/or 8KB of data, below which we can just embed this
in the per-command pdu in blk-mq. For any IO at or below
NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.

For higher IOPS, this saves up to 1% of CPU time.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c | 119 ++++++++++++++++++++++++++++++++++------------
 include/linux/nvme.h      |   3 +-
 2 files changed, 89 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f4aa64160838..3eaa0becc52d 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
 	void *ctx;
 	int aborted;
 	struct nvme_queue *nvmeq;
+	struct nvme_iod iod[0];
 };
 
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES		2
+#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	cmd->aborted = 0;
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -360,35 +402,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed.  This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+			               gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+                                                sizeof(struct nvme_dsm_range);
+	unsigned long mask = 0;
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		mask = 0x01;
+		iod_init(iod, size, rq->nr_phys_segments,
+				(unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -404,7 +464,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -423,7 +485,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -579,7 +641,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 							struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -620,17 +682,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -645,10 +702,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1362,7 +1419,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1483,7 +1540,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2109,7 +2166,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 258945fcabf1..19a5d4b23209 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -132,13 +132,12 @@ struct nvme_ns {
  * allocated to store the PRP list.
  */
 struct nvme_iod {
-	void *private;		/* For the use of the submitter of the I/O */
+	unsigned long private;	/* For the use of the submitter of the I/O */
 	int npages;		/* In the PRP list. 0 means small pool in use */
 	int offset;		/* Of PRP list */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
 	dma_addr_t first_dma;
-	struct list_head node;
 	struct scatterlist sg[0];
 };
 
-- 
cgit v1.2.3


From 50dd64d57eee8aed1a86e875ce77d078e44fad00 Mon Sep 17 00:00:00 2001
From: Karol Wrona <k.wrona@samsung.com>
Date: Wed, 28 Jan 2015 15:05:50 +0100
Subject: iio: common: ssp_sensors: Add sensorhub driver

Sensorhub  is MCU dedicated to collect data and manage several sensors.
Sensorhub is a spi device which provides a layer for IIO devices. It provides
some data parsing and common mechanism for sensorhub sensors.

Adds common sensorhub library for sensorhub driver and iio drivers
which uses sensorhub MCU to communicate with sensors.

Signed-off-by: Karol Wrona <k.wrona@samsung.com>
Acked-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/common/Kconfig               |   1 +
 drivers/iio/common/Makefile              |   1 +
 drivers/iio/common/ssp_sensors/Kconfig   |  16 +
 drivers/iio/common/ssp_sensors/Makefile  |   6 +
 drivers/iio/common/ssp_sensors/ssp.h     | 257 +++++++++++
 drivers/iio/common/ssp_sensors/ssp_dev.c | 712 +++++++++++++++++++++++++++++++
 drivers/iio/common/ssp_sensors/ssp_spi.c | 608 ++++++++++++++++++++++++++
 include/linux/iio/common/ssp_sensors.h   |  82 ++++
 8 files changed, 1683 insertions(+)
 create mode 100644 drivers/iio/common/ssp_sensors/Kconfig
 create mode 100644 drivers/iio/common/ssp_sensors/Makefile
 create mode 100644 drivers/iio/common/ssp_sensors/ssp.h
 create mode 100644 drivers/iio/common/ssp_sensors/ssp_dev.c
 create mode 100644 drivers/iio/common/ssp_sensors/ssp_spi.c
 create mode 100644 include/linux/iio/common/ssp_sensors.h

(limited to 'include/linux')

diff --git a/drivers/iio/common/Kconfig b/drivers/iio/common/Kconfig
index 0b6e97d18fa0..790f106d719c 100644
--- a/drivers/iio/common/Kconfig
+++ b/drivers/iio/common/Kconfig
@@ -3,4 +3,5 @@
 #
 
 source "drivers/iio/common/hid-sensors/Kconfig"
+source "drivers/iio/common/ssp_sensors/Kconfig"
 source "drivers/iio/common/st_sensors/Kconfig"
diff --git a/drivers/iio/common/Makefile b/drivers/iio/common/Makefile
index 3112df0060e9..b1e4d9c9591c 100644
--- a/drivers/iio/common/Makefile
+++ b/drivers/iio/common/Makefile
@@ -8,4 +8,5 @@
 
 # When adding new entries keep the list in alphabetical order
 obj-y += hid-sensors/
+obj-y += ssp_sensors/
 obj-y += st_sensors/
diff --git a/drivers/iio/common/ssp_sensors/Kconfig b/drivers/iio/common/ssp_sensors/Kconfig
new file mode 100644
index 000000000000..759b975e2b40
--- /dev/null
+++ b/drivers/iio/common/ssp_sensors/Kconfig
@@ -0,0 +1,16 @@
+#
+# SSP sensor drivers and commons configuration
+#
+menu "SSP Sensor Common"
+
+config IIO_SSP_SENSORHUB
+	tristate "Samsung Sensorhub driver"
+	depends on SPI
+	select MFD_CORE
+	help
+	  SSP driver for sensorhub.
+	  If you say yes here you get ssp support for sensorhub.
+	  To compile this driver as a module, choose M here: the
+	  module will be called sensorhub.
+
+endmenu
diff --git a/drivers/iio/common/ssp_sensors/Makefile b/drivers/iio/common/ssp_sensors/Makefile
new file mode 100644
index 000000000000..07d3d6a6763b
--- /dev/null
+++ b/drivers/iio/common/ssp_sensors/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for SSP sensor drivers and commons.
+#
+
+sensorhub-objs				:= ssp_dev.o ssp_spi.o
+obj-$(CONFIG_IIO_SSP_SENSORHUB)		+= sensorhub.o
diff --git a/drivers/iio/common/ssp_sensors/ssp.h b/drivers/iio/common/ssp_sensors/ssp.h
new file mode 100644
index 000000000000..b910e91d7c0d
--- /dev/null
+++ b/drivers/iio/common/ssp_sensors/ssp.h
@@ -0,0 +1,257 @@
+/*
+ *  Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#ifndef __SSP_SENSORHUB_H__
+#define __SSP_SENSORHUB_H__
+
+#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/iio/common/ssp_sensors.h>
+#include <linux/iio/iio.h>
+#include <linux/spi/spi.h>
+
+#define SSP_DEVICE_ID		0x55
+
+#ifdef SSP_DBG
+#define ssp_dbg(format, ...) pr_info("[SSP] "format, ##__VA_ARGS__)
+#else
+#define ssp_dbg(format, ...)
+#endif
+
+#define SSP_SW_RESET_TIME		3000
+/* Sensor polling in ms */
+#define SSP_DEFAULT_POLLING_DELAY	200
+#define SSP_DEFAULT_RETRIES		3
+#define SSP_DATA_PACKET_SIZE		960
+#define SSP_HEADER_BUFFER_SIZE		4
+
+enum {
+	SSP_KERNEL_BINARY = 0,
+	SSP_KERNEL_CRASHED_BINARY,
+};
+
+enum {
+	SSP_INITIALIZATION_STATE = 0,
+	SSP_NO_SENSOR_STATE,
+	SSP_ADD_SENSOR_STATE,
+	SSP_RUNNING_SENSOR_STATE,
+};
+
+/* Firmware download STATE */
+enum {
+	SSP_FW_DL_STATE_FAIL = -1,
+	SSP_FW_DL_STATE_NONE = 0,
+	SSP_FW_DL_STATE_NEED_TO_SCHEDULE,
+	SSP_FW_DL_STATE_SCHEDULED,
+	SSP_FW_DL_STATE_DOWNLOADING,
+	SSP_FW_DL_STATE_SYNC,
+	SSP_FW_DL_STATE_DONE,
+};
+
+#define SSP_INVALID_REVISION			99999
+#define SSP_INVALID_REVISION2			0xffffff
+
+/* AP -> SSP Instruction */
+#define SSP_MSG2SSP_INST_BYPASS_SENSOR_ADD	0xa1
+#define SSP_MSG2SSP_INST_BYPASS_SENSOR_RM	0xa2
+#define SSP_MSG2SSP_INST_REMOVE_ALL		0xa3
+#define SSP_MSG2SSP_INST_CHANGE_DELAY		0xa4
+#define SSP_MSG2SSP_INST_LIBRARY_ADD		0xb1
+#define SSP_MSG2SSP_INST_LIBRARY_REMOVE		0xb2
+#define SSP_MSG2SSP_INST_LIB_NOTI		0xb4
+#define SSP_MSG2SSP_INST_LIB_DATA		0xc1
+
+#define SSP_MSG2SSP_AP_MCU_SET_GYRO_CAL		0xcd
+#define SSP_MSG2SSP_AP_MCU_SET_ACCEL_CAL	0xce
+#define SSP_MSG2SSP_AP_STATUS_SHUTDOWN		0xd0
+#define SSP_MSG2SSP_AP_STATUS_WAKEUP		0xd1
+#define SSP_MSG2SSP_AP_STATUS_SLEEP		0xd2
+#define SSP_MSG2SSP_AP_STATUS_RESUME		0xd3
+#define SSP_MSG2SSP_AP_STATUS_SUSPEND		0xd4
+#define SSP_MSG2SSP_AP_STATUS_RESET		0xd5
+#define SSP_MSG2SSP_AP_STATUS_POW_CONNECTED	0xd6
+#define SSP_MSG2SSP_AP_STATUS_POW_DISCONNECTED	0xd7
+#define SSP_MSG2SSP_AP_TEMPHUMIDITY_CAL_DONE	0xda
+#define SSP_MSG2SSP_AP_MCU_SET_DUMPMODE		0xdb
+#define SSP_MSG2SSP_AP_MCU_DUMP_CHECK		0xdc
+#define SSP_MSG2SSP_AP_MCU_BATCH_FLUSH		0xdd
+#define SSP_MSG2SSP_AP_MCU_BATCH_COUNT		0xdf
+
+#define SSP_MSG2SSP_AP_WHOAMI				0x0f
+#define SSP_MSG2SSP_AP_FIRMWARE_REV			0xf0
+#define SSP_MSG2SSP_AP_SENSOR_FORMATION			0xf1
+#define SSP_MSG2SSP_AP_SENSOR_PROXTHRESHOLD		0xf2
+#define SSP_MSG2SSP_AP_SENSOR_BARCODE_EMUL		0xf3
+#define SSP_MSG2SSP_AP_SENSOR_SCANNING			0xf4
+#define SSP_MSG2SSP_AP_SET_MAGNETIC_HWOFFSET		0xf5
+#define SSP_MSG2SSP_AP_GET_MAGNETIC_HWOFFSET		0xf6
+#define SSP_MSG2SSP_AP_SENSOR_GESTURE_CURRENT		0xf7
+#define SSP_MSG2SSP_AP_GET_THERM			0xf8
+#define SSP_MSG2SSP_AP_GET_BIG_DATA			0xf9
+#define SSP_MSG2SSP_AP_SET_BIG_DATA			0xfa
+#define SSP_MSG2SSP_AP_START_BIG_DATA			0xfb
+#define SSP_MSG2SSP_AP_SET_MAGNETIC_STATIC_MATRIX	0xfd
+#define SSP_MSG2SSP_AP_SENSOR_TILT			0xea
+#define SSP_MSG2SSP_AP_MCU_SET_TIME			0xfe
+#define SSP_MSG2SSP_AP_MCU_GET_TIME			0xff
+
+#define SSP_MSG2SSP_AP_FUSEROM				0x01
+
+/* voice data */
+#define SSP_TYPE_WAKE_UP_VOICE_SERVICE			0x01
+#define SSP_TYPE_WAKE_UP_VOICE_SOUND_SOURCE_AM		0x01
+#define SSP_TYPE_WAKE_UP_VOICE_SOUND_SOURCE_GRAMMER	0x02
+
+/* Factory Test */
+#define SSP_ACCELEROMETER_FACTORY			0x80
+#define SSP_GYROSCOPE_FACTORY				0x81
+#define SSP_GEOMAGNETIC_FACTORY				0x82
+#define SSP_PRESSURE_FACTORY				0x85
+#define SSP_GESTURE_FACTORY				0x86
+#define SSP_TEMPHUMIDITY_CRC_FACTORY			0x88
+#define SSP_GYROSCOPE_TEMP_FACTORY			0x8a
+#define SSP_GYROSCOPE_DPS_FACTORY			0x8b
+#define SSP_MCU_FACTORY					0x8c
+#define SSP_MCU_SLEEP_FACTORY				0x8d
+
+/* SSP -> AP ACK about write CMD */
+#define SSP_MSG_ACK		0x80	/* ACK from SSP to AP */
+#define SSP_MSG_NAK		0x70	/* NAK from SSP to AP */
+
+struct ssp_sensorhub_info {
+	char *fw_name;
+	char *fw_crashed_name;
+	unsigned int fw_rev;
+	const u8 * const mag_table;
+	const unsigned int mag_length;
+};
+
+/* ssp_msg options bit */
+#define SSP_RW		0
+#define SSP_INDEX	3
+
+#define SSP_AP2HUB_READ		0
+#define SSP_AP2HUB_WRITE	1
+#define SSP_HUB2AP_WRITE	2
+#define SSP_AP2HUB_READY	3
+#define SSP_AP2HUB_RETURN	4
+
+/**
+ * struct ssp_data - ssp platformdata structure
+ * @spi:		spi device
+ * @sensorhub_info:	info about sensorhub board specific features
+ * @wdt_timer:		watchdog timer
+ * @work_wdt:		watchdog work
+ * @work_firmware:	firmware upgrade work queue
+ * @work_refresh:	refresh work queue for reset request from MCU
+ * @shut_down:		shut down flag
+ * @mcu_dump_mode:	mcu dump mode for debug
+ * @time_syncing:	time syncing indication flag
+ * @timestamp:		previous time in ns calculated for time syncing
+ * @check_status:	status table for each sensor
+ * @com_fail_cnt:	communication fail count
+ * @reset_cnt:		reset count
+ * @timeout_cnt:	timeout count
+ * @available_sensors:	available sensors seen by sensorhub (bit array)
+ * @cur_firm_rev:	cached current firmware revision
+ * @last_resume_state:	last AP resume/suspend state used to handle the PM
+ *                      state of ssp
+ * @last_ap_state:	(obsolete) sleep notification for MCU
+ * @sensor_enable:	sensor enable mask
+ * @delay_buf:		data acquisition intervals table
+ * @batch_latency_buf:	yet unknown but existing in communication protocol
+ * @batch_opt_buf:	yet unknown but existing in communication protocol
+ * @accel_position:	yet unknown but existing in communication protocol
+ * @mag_position:	yet unknown but existing in communication protocol
+ * @fw_dl_state:	firmware download state
+ * @comm_lock:		lock protecting the handshake
+ * @pending_lock:	lock protecting pending list and completion
+ * @mcu_reset_gpio:	mcu reset line
+ * @ap_mcu_gpio:	ap to mcu gpio line
+ * @mcu_ap_gpio:	mcu to ap gpio line
+ * @pending_list:	pending list for messages queued to be sent/read
+ * @sensor_devs:	registered IIO devices table
+ * @enable_refcount:	enable reference count for wdt (watchdog timer)
+ * @header_buffer:	cache aligned buffer for packet header
+ */
+struct ssp_data {
+	struct spi_device *spi;
+	struct ssp_sensorhub_info *sensorhub_info;
+	struct timer_list wdt_timer;
+	struct work_struct work_wdt;
+	struct delayed_work work_refresh;
+
+	bool shut_down;
+	bool mcu_dump_mode;
+	bool time_syncing;
+	int64_t timestamp;
+
+	int check_status[SSP_SENSOR_MAX];
+
+	unsigned int com_fail_cnt;
+	unsigned int reset_cnt;
+	unsigned int timeout_cnt;
+
+	unsigned int available_sensors;
+	unsigned int cur_firm_rev;
+
+	char last_resume_state;
+	char last_ap_state;
+
+	unsigned int sensor_enable;
+	u32 delay_buf[SSP_SENSOR_MAX];
+	s32 batch_latency_buf[SSP_SENSOR_MAX];
+	s8 batch_opt_buf[SSP_SENSOR_MAX];
+
+	int accel_position;
+	int mag_position;
+	int fw_dl_state;
+
+	struct mutex comm_lock;
+	struct mutex pending_lock;
+
+	int mcu_reset_gpio;
+	int ap_mcu_gpio;
+	int mcu_ap_gpio;
+
+	struct list_head pending_list;
+
+	struct iio_dev *sensor_devs[SSP_SENSOR_MAX];
+	atomic_t enable_refcount;
+
+	__le16 header_buffer[SSP_HEADER_BUFFER_SIZE / sizeof(__le16)]
+		____cacheline_aligned;
+};
+
+void ssp_clean_pending_list(struct ssp_data *data);
+
+int ssp_command(struct ssp_data *data, char command, int arg);
+
+int ssp_send_instruction(struct ssp_data *data, u8 inst, u8 sensor_type,
+			 u8 *send_buf, u8 length);
+
+int ssp_irq_msg(struct ssp_data *data);
+
+int ssp_get_chipid(struct ssp_data *data);
+
+int ssp_set_magnetic_matrix(struct ssp_data *data);
+
+unsigned int ssp_get_sensor_scanning_info(struct ssp_data *data);
+
+unsigned int ssp_get_firmware_rev(struct ssp_data *data);
+
+int ssp_queue_ssp_refresh_task(struct ssp_data *data, unsigned int delay);
+
+#endif /* __SSP_SENSORHUB_H__ */
diff --git a/drivers/iio/common/ssp_sensors/ssp_dev.c b/drivers/iio/common/ssp_sensors/ssp_dev.c
new file mode 100644
index 000000000000..52d70435f5a1
--- /dev/null
+++ b/drivers/iio/common/ssp_sensors/ssp_dev.c
@@ -0,0 +1,712 @@
+/*
+ *  Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#include <linux/iio/iio.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_gpio.h>
+#include <linux/of_platform.h>
+#include "ssp.h"
+
+#define SSP_WDT_TIME			10000
+#define SSP_LIMIT_RESET_CNT		20
+#define SSP_LIMIT_TIMEOUT_CNT		3
+
+/* It is possible that it is max clk rate for version 1.0 of bootcode */
+#define SSP_BOOT_SPI_HZ	400000
+
+/*
+ * These fields can look enigmatic but this structure is used mainly to flat
+ * some values and depends on command type.
+ */
+struct ssp_instruction {
+	__le32 a;
+	__le32 b;
+	u8 c;
+} __attribute__((__packed__));
+
+static const u8 ssp_magnitude_table[] = {110, 85, 171, 71, 203, 195, 0, 67,
+	208, 56, 175, 244, 206, 213, 0, 92, 250, 0, 55, 48, 189, 252, 171,
+	243, 13, 45, 250};
+
+static const struct ssp_sensorhub_info ssp_rinato_info = {
+	.fw_name = "ssp_B2.fw",
+	.fw_crashed_name = "ssp_crashed.fw",
+	.fw_rev = 14052300,
+	.mag_table = ssp_magnitude_table,
+	.mag_length = ARRAY_SIZE(ssp_magnitude_table),
+};
+
+static const struct ssp_sensorhub_info ssp_thermostat_info = {
+	.fw_name = "thermostat_B2.fw",
+	.fw_crashed_name = "ssp_crashed.fw",
+	.fw_rev = 14080600,
+	.mag_table = ssp_magnitude_table,
+	.mag_length = ARRAY_SIZE(ssp_magnitude_table),
+};
+
+static const struct mfd_cell sensorhub_sensor_devs[] = {
+	{
+		.name = "ssp-accelerometer",
+	},
+	{
+		.name = "ssp-gyroscope",
+	},
+};
+
+static void ssp_toggle_mcu_reset_gpio(struct ssp_data *data)
+{
+	gpio_set_value(data->mcu_reset_gpio, 0);
+	usleep_range(1000, 1200);
+	gpio_set_value(data->mcu_reset_gpio, 1);
+	msleep(50);
+}
+
+static void ssp_sync_available_sensors(struct ssp_data *data)
+{
+	int i, ret;
+
+	for (i = 0; i < SSP_SENSOR_MAX; ++i) {
+		if (data->available_sensors & BIT(i)) {
+			ret = ssp_enable_sensor(data, i, data->delay_buf[i]);
+			if (ret < 0) {
+				dev_err(&data->spi->dev,
+					"Sync sensor nr: %d fail\n", i);
+				continue;
+			}
+		}
+	}
+
+	ret = ssp_command(data, SSP_MSG2SSP_AP_MCU_SET_DUMPMODE,
+			  data->mcu_dump_mode);
+	if (ret < 0)
+		dev_err(&data->spi->dev,
+			"SSP_MSG2SSP_AP_MCU_SET_DUMPMODE failed\n");
+}
+
+static void ssp_enable_mcu(struct ssp_data *data, bool enable)
+{
+	dev_info(&data->spi->dev, "current shutdown = %d, old = %d\n", enable,
+		 data->shut_down);
+
+	if (enable && data->shut_down) {
+		data->shut_down = false;
+		enable_irq(data->spi->irq);
+		enable_irq_wake(data->spi->irq);
+	} else if (!enable && !data->shut_down) {
+		data->shut_down = true;
+		disable_irq(data->spi->irq);
+		disable_irq_wake(data->spi->irq);
+	} else {
+		dev_warn(&data->spi->dev, "current shutdown = %d, old = %d\n",
+			 enable, data->shut_down);
+	}
+}
+
+/*
+ * This function is the first one which communicates with the mcu so it is
+ * possible that the first attempt will fail
+ */
+static int ssp_check_fwbl(struct ssp_data *data)
+{
+	int retries = 0;
+
+	while (retries++ < 5) {
+		data->cur_firm_rev = ssp_get_firmware_rev(data);
+		if (data->cur_firm_rev == SSP_INVALID_REVISION ||
+		    data->cur_firm_rev == SSP_INVALID_REVISION2) {
+			dev_warn(&data->spi->dev,
+				 "Invalid revision, trying %d time\n", retries);
+		} else {
+			break;
+		}
+	}
+
+	if (data->cur_firm_rev == SSP_INVALID_REVISION ||
+	    data->cur_firm_rev == SSP_INVALID_REVISION2) {
+		dev_err(&data->spi->dev, "SSP_INVALID_REVISION\n");
+		return SSP_FW_DL_STATE_NEED_TO_SCHEDULE;
+	}
+
+	dev_info(&data->spi->dev,
+		 "MCU Firm Rev : Old = %8u, New = %8u\n",
+		 data->cur_firm_rev,
+		 data->sensorhub_info->fw_rev);
+
+	if (data->cur_firm_rev != data->sensorhub_info->fw_rev)
+		return SSP_FW_DL_STATE_NEED_TO_SCHEDULE;
+
+	return SSP_FW_DL_STATE_NONE;
+}
+
+static void ssp_reset_mcu(struct ssp_data *data)
+{
+	ssp_enable_mcu(data, false);
+	ssp_clean_pending_list(data);
+	ssp_toggle_mcu_reset_gpio(data);
+	ssp_enable_mcu(data, true);
+}
+
+static void ssp_wdt_work_func(struct work_struct *work)
+{
+	struct ssp_data *data = container_of(work, struct ssp_data, work_wdt);
+
+	dev_err(&data->spi->dev, "%s - Sensor state: 0x%x, RC: %u, CC: %u\n",
+		__func__, data->available_sensors, data->reset_cnt,
+		data->com_fail_cnt);
+
+	ssp_reset_mcu(data);
+	data->com_fail_cnt = 0;
+	data->timeout_cnt = 0;
+}
+
+static void ssp_wdt_timer_func(unsigned long ptr)
+{
+	struct ssp_data *data = (struct ssp_data *)ptr;
+
+	switch (data->fw_dl_state) {
+	case SSP_FW_DL_STATE_FAIL:
+	case SSP_FW_DL_STATE_DOWNLOADING:
+	case SSP_FW_DL_STATE_SYNC:
+		goto _mod;
+	}
+
+	if (data->timeout_cnt > SSP_LIMIT_TIMEOUT_CNT ||
+	    data->com_fail_cnt > SSP_LIMIT_RESET_CNT)
+		queue_work(system_power_efficient_wq, &data->work_wdt);
+_mod:
+	mod_timer(&data->wdt_timer, jiffies + msecs_to_jiffies(SSP_WDT_TIME));
+}
+
+static void ssp_enable_wdt_timer(struct ssp_data *data)
+{
+	mod_timer(&data->wdt_timer, jiffies + msecs_to_jiffies(SSP_WDT_TIME));
+}
+
+static void ssp_disable_wdt_timer(struct ssp_data *data)
+{
+	del_timer_sync(&data->wdt_timer);
+	cancel_work_sync(&data->work_wdt);
+}
+
+/**
+ * ssp_get_sensor_delay() - gets sensor data acquisition period
+ * @data:	sensorhub structure
+ * @type:	SSP sensor type
+ *
+ * Returns acquisition period in ms
+ */
+u32 ssp_get_sensor_delay(struct ssp_data *data, enum ssp_sensor_type type)
+{
+	return data->delay_buf[type];
+}
+EXPORT_SYMBOL(ssp_get_sensor_delay);
+
+/**
+ * ssp_enable_sensor() - enables data acquisition for sensor
+ * @data:	sensorhub structure
+ * @type:	SSP sensor type
+ * @delay:	delay in ms
+ *
+ * Returns 0 or negative value in case of error
+ */
+int ssp_enable_sensor(struct ssp_data *data, enum ssp_sensor_type type,
+		      u32 delay)
+{
+	int ret;
+	struct ssp_instruction to_send;
+
+	to_send.a = cpu_to_le32(delay);
+	to_send.b = cpu_to_le32(data->batch_latency_buf[type]);
+	to_send.c = data->batch_opt_buf[type];
+
+	switch (data->check_status[type]) {
+	case SSP_INITIALIZATION_STATE:
+		/* do calibration step, now just enable */
+	case SSP_ADD_SENSOR_STATE:
+		ret = ssp_send_instruction(data,
+					   SSP_MSG2SSP_INST_BYPASS_SENSOR_ADD,
+					   type,
+					   (u8 *)&to_send, sizeof(to_send));
+		if (ret < 0) {
+			dev_err(&data->spi->dev, "Enabling sensor failed\n");
+			data->check_status[type] = SSP_NO_SENSOR_STATE;
+			goto derror;
+		}
+
+		data->sensor_enable |= BIT(type);
+		data->check_status[type] = SSP_RUNNING_SENSOR_STATE;
+		break;
+	case SSP_RUNNING_SENSOR_STATE:
+		ret = ssp_send_instruction(data,
+					   SSP_MSG2SSP_INST_CHANGE_DELAY, type,
+					   (u8 *)&to_send, sizeof(to_send));
+		if (ret < 0) {
+			dev_err(&data->spi->dev,
+				"Changing sensor delay failed\n");
+			goto derror;
+		}
+		break;
+	default:
+		data->check_status[type] = SSP_ADD_SENSOR_STATE;
+		break;
+	}
+
+	data->delay_buf[type] = delay;
+
+	if (atomic_inc_return(&data->enable_refcount) == 1)
+		ssp_enable_wdt_timer(data);
+
+	return 0;
+
+derror:
+	return ret;
+}
+EXPORT_SYMBOL(ssp_enable_sensor);
+
+/**
+ * ssp_change_delay() - changes data acquisition for sensor
+ * @data:	sensorhub structure
+ * @type:	SSP sensor type
+ * @delay:	delay in ms
+ *
+ * Returns 0 or negative value in case of error
+ */
+int ssp_change_delay(struct ssp_data *data, enum ssp_sensor_type type,
+		     u32 delay)
+{
+	int ret;
+	struct ssp_instruction to_send;
+
+	to_send.a = cpu_to_le32(delay);
+	to_send.b = cpu_to_le32(data->batch_latency_buf[type]);
+	to_send.c = data->batch_opt_buf[type];
+
+	ret = ssp_send_instruction(data, SSP_MSG2SSP_INST_CHANGE_DELAY, type,
+				   (u8 *)&to_send, sizeof(to_send));
+	if (ret < 0) {
+		dev_err(&data->spi->dev, "Changing sensor delay failed\n");
+		return ret;
+	}
+
+	data->delay_buf[type] = delay;
+
+	return 0;
+}
+EXPORT_SYMBOL(ssp_change_delay);
+
+/**
+ * ssp_disable_sensor() - disables sensor
+ *
+ * @data:	sensorhub structure
+ * @type:	SSP sensor type
+ *
+ * Returns 0 or negative value in case of error
+ */
+int ssp_disable_sensor(struct ssp_data *data, enum ssp_sensor_type type)
+{
+	int ret;
+	__le32 command;
+
+	if (data->sensor_enable & BIT(type)) {
+		command = cpu_to_le32(data->delay_buf[type]);
+
+		ret = ssp_send_instruction(data,
+					   SSP_MSG2SSP_INST_BYPASS_SENSOR_RM,
+					   type, (u8 *)&command,
+					   sizeof(command));
+		if (ret < 0) {
+			dev_err(&data->spi->dev, "Remove sensor fail\n");
+			return ret;
+		}
+
+		data->sensor_enable &= ~BIT(type);
+	}
+
+	data->check_status[type] = SSP_ADD_SENSOR_STATE;
+
+	if (atomic_dec_and_test(&data->enable_refcount))
+		ssp_disable_wdt_timer(data);
+
+	return 0;
+}
+EXPORT_SYMBOL(ssp_disable_sensor);
+
+static irqreturn_t ssp_irq_thread_fn(int irq, void *dev_id)
+{
+	struct ssp_data *data = dev_id;
+
+	/*
+	 * This wrapper is done to preserve error path for ssp_irq_msg, also
+	 * it is defined in different file.
+	 */
+	ssp_irq_msg(data);
+
+	return IRQ_HANDLED;
+}
+
+static int ssp_initialize_mcu(struct ssp_data *data)
+{
+	int ret;
+
+	ssp_clean_pending_list(data);
+
+	ret = ssp_get_chipid(data);
+	if (ret != SSP_DEVICE_ID) {
+		dev_err(&data->spi->dev, "%s - MCU %s ret = %d\n", __func__,
+			ret < 0 ? "is not working" : "identification failed",
+			ret);
+		return ret < 0 ? ret : -ENODEV;
+	}
+
+	dev_info(&data->spi->dev, "MCU device ID = %d\n", ret);
+
+	/*
+	 * needs clarification, for now do not want to export all transfer
+	 * methods to sensors' drivers
+	 */
+	ret = ssp_set_magnetic_matrix(data);
+	if (ret < 0) {
+		dev_err(&data->spi->dev,
+			"%s - ssp_set_magnetic_matrix failed\n", __func__);
+		return ret;
+	}
+
+	data->available_sensors = ssp_get_sensor_scanning_info(data);
+	if (data->available_sensors == 0) {
+		dev_err(&data->spi->dev,
+			"%s - ssp_get_sensor_scanning_info failed\n", __func__);
+		return -EIO;
+	}
+
+	data->cur_firm_rev = ssp_get_firmware_rev(data);
+	dev_info(&data->spi->dev, "MCU Firm Rev : New = %8u\n",
+		 data->cur_firm_rev);
+
+	return ssp_command(data, SSP_MSG2SSP_AP_MCU_DUMP_CHECK, 0);
+}
+
+/*
+ * sensorhub can request its reinitialization as some brutal and rare error
+ * handling. It can be requested from the MCU.
+ */
+static void ssp_refresh_task(struct work_struct *work)
+{
+	struct ssp_data *data = container_of((struct delayed_work *)work,
+					     struct ssp_data, work_refresh);
+
+	dev_info(&data->spi->dev, "refreshing\n");
+
+	data->reset_cnt++;
+
+	if (ssp_initialize_mcu(data) >= 0) {
+		ssp_sync_available_sensors(data);
+		if (data->last_ap_state != 0)
+			ssp_command(data, data->last_ap_state, 0);
+
+		if (data->last_resume_state != 0)
+			ssp_command(data, data->last_resume_state, 0);
+
+		data->timeout_cnt = 0;
+		data->com_fail_cnt = 0;
+	}
+}
+
+int ssp_queue_ssp_refresh_task(struct ssp_data *data, unsigned int delay)
+{
+	cancel_delayed_work_sync(&data->work_refresh);
+
+	return queue_delayed_work(system_power_efficient_wq,
+				  &data->work_refresh,
+				  msecs_to_jiffies(delay));
+}
+
+#ifdef CONFIG_OF
+static struct of_device_id ssp_of_match[] = {
+	{
+		.compatible	= "samsung,sensorhub-rinato",
+		.data		= &ssp_rinato_info,
+	}, {
+		.compatible	= "samsung,sensorhub-thermostat",
+		.data		= &ssp_thermostat_info,
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, ssp_of_match);
+
+static struct ssp_data *ssp_parse_dt(struct device *dev)
+{
+	int ret;
+	struct ssp_data *data;
+	struct device_node *node = dev->of_node;
+	const struct of_device_id *match;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	data->mcu_ap_gpio = of_get_named_gpio(node, "mcu-ap-gpios", 0);
+	if (data->mcu_ap_gpio < 0)
+		goto err_free_pd;
+
+	data->ap_mcu_gpio = of_get_named_gpio(node, "ap-mcu-gpios", 0);
+	if (data->ap_mcu_gpio < 0)
+		goto err_free_pd;
+
+	data->mcu_reset_gpio = of_get_named_gpio(node, "mcu-reset-gpios", 0);
+	if (data->mcu_reset_gpio < 0)
+		goto err_free_pd;
+
+	ret = devm_gpio_request_one(dev, data->ap_mcu_gpio, GPIOF_OUT_INIT_HIGH,
+				    "ap-mcu-gpios");
+	if (ret)
+		goto err_free_pd;
+
+	ret = devm_gpio_request_one(dev, data->mcu_reset_gpio,
+				    GPIOF_OUT_INIT_HIGH, "mcu-reset-gpios");
+	if (ret)
+		goto err_ap_mcu;
+
+	match = of_match_node(ssp_of_match, node);
+	if (!match)
+		goto err_mcu_reset_gpio;
+
+	data->sensorhub_info = (struct ssp_sensorhub_info *)match->data;
+
+	dev_set_drvdata(dev, data);
+
+	return data;
+
+err_mcu_reset_gpio:
+	devm_gpio_free(dev, data->mcu_reset_gpio);
+err_ap_mcu:
+	devm_gpio_free(dev, data->ap_mcu_gpio);
+err_free_pd:
+	devm_kfree(dev, data);
+	return NULL;
+}
+#else
+static struct ssp_data *ssp_parse_dt(struct device *pdev)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * ssp_register_consumer() - registers iio consumer in ssp framework
+ *
+ * @indio_dev:	consumer iio device
+ * @type:	ssp sensor type
+ */
+void ssp_register_consumer(struct iio_dev *indio_dev, enum ssp_sensor_type type)
+{
+	struct ssp_data *data = dev_get_drvdata(indio_dev->dev.parent->parent);
+
+	data->sensor_devs[type] = indio_dev;
+}
+EXPORT_SYMBOL(ssp_register_consumer);
+
+static int ssp_probe(struct spi_device *spi)
+{
+	int ret, i;
+	struct ssp_data *data;
+
+	data = ssp_parse_dt(&spi->dev);
+	if (!data) {
+		dev_err(&spi->dev, "Failed to find platform data\n");
+		return -ENODEV;
+	}
+
+	ret = mfd_add_devices(&spi->dev, -1, sensorhub_sensor_devs,
+			      ARRAY_SIZE(sensorhub_sensor_devs), NULL, 0, NULL);
+	if (ret < 0) {
+		dev_err(&spi->dev, "mfd add devices fail\n");
+		return ret;
+	}
+
+	spi->mode = SPI_MODE_1;
+	ret = spi_setup(spi);
+	if (ret < 0) {
+		dev_err(&spi->dev, "Failed to setup spi\n");
+		return ret;
+	}
+
+	data->fw_dl_state = SSP_FW_DL_STATE_NONE;
+	data->spi = spi;
+	spi_set_drvdata(spi, data);
+
+	mutex_init(&data->comm_lock);
+
+	for (i = 0; i < SSP_SENSOR_MAX; ++i) {
+		data->delay_buf[i] = SSP_DEFAULT_POLLING_DELAY;
+		data->batch_latency_buf[i] = 0;
+		data->batch_opt_buf[i] = 0;
+		data->check_status[i] = SSP_INITIALIZATION_STATE;
+	}
+
+	data->delay_buf[SSP_BIO_HRM_LIB] = 100;
+
+	data->time_syncing = true;
+
+	mutex_init(&data->pending_lock);
+	INIT_LIST_HEAD(&data->pending_list);
+
+	atomic_set(&data->enable_refcount, 0);
+
+	INIT_WORK(&data->work_wdt, ssp_wdt_work_func);
+	INIT_DELAYED_WORK(&data->work_refresh, ssp_refresh_task);
+
+	setup_timer(&data->wdt_timer, ssp_wdt_timer_func, (unsigned long)data);
+
+	ret = request_threaded_irq(data->spi->irq, NULL,
+				   ssp_irq_thread_fn,
+				   IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+				   "SSP_Int", data);
+	if (ret < 0) {
+		dev_err(&spi->dev, "Irq request fail\n");
+		goto err_setup_irq;
+	}
+
+	/* Let's start with enabled one so irq balance could be ok */
+	data->shut_down = false;
+
+	/* just to avoid unbalanced irq set wake up */
+	enable_irq_wake(data->spi->irq);
+
+	data->fw_dl_state = ssp_check_fwbl(data);
+	if (data->fw_dl_state == SSP_FW_DL_STATE_NONE) {
+		ret = ssp_initialize_mcu(data);
+		if (ret < 0) {
+			dev_err(&spi->dev, "Initialize_mcu failed\n");
+			goto err_read_reg;
+		}
+	} else {
+		dev_err(&spi->dev, "Firmware version not supported\n");
+		ret = -EPERM;
+		goto err_read_reg;
+	}
+
+	return 0;
+
+err_read_reg:
+	free_irq(data->spi->irq, data);
+err_setup_irq:
+	mutex_destroy(&data->pending_lock);
+	mutex_destroy(&data->comm_lock);
+
+	dev_err(&spi->dev, "Probe failed!\n");
+
+	return ret;
+}
+
+static int ssp_remove(struct spi_device *spi)
+{
+	struct ssp_data *data = spi_get_drvdata(spi);
+
+	if (ssp_command(data, SSP_MSG2SSP_AP_STATUS_SHUTDOWN, 0) < 0)
+		dev_err(&data->spi->dev,
+			"SSP_MSG2SSP_AP_STATUS_SHUTDOWN failed\n");
+
+	ssp_enable_mcu(data, false);
+	ssp_disable_wdt_timer(data);
+
+	ssp_clean_pending_list(data);
+
+	free_irq(data->spi->irq, data);
+
+	del_timer_sync(&data->wdt_timer);
+	cancel_work_sync(&data->work_wdt);
+
+	mutex_destroy(&data->comm_lock);
+	mutex_destroy(&data->pending_lock);
+
+	mfd_remove_devices(&spi->dev);
+
+	return 0;
+}
+
+static int ssp_suspend(struct device *dev)
+{
+	int ret;
+	struct ssp_data *data = spi_get_drvdata(to_spi_device(dev));
+
+	data->last_resume_state = SSP_MSG2SSP_AP_STATUS_SUSPEND;
+
+	if (atomic_read(&data->enable_refcount) > 0)
+		ssp_disable_wdt_timer(data);
+
+	ret = ssp_command(data, SSP_MSG2SSP_AP_STATUS_SUSPEND, 0);
+	if (ret < 0) {
+		dev_err(&data->spi->dev,
+			"%s SSP_MSG2SSP_AP_STATUS_SUSPEND failed\n", __func__);
+
+		ssp_enable_wdt_timer(data);
+		return ret;
+	}
+
+	data->time_syncing = false;
+	disable_irq(data->spi->irq);
+
+	return 0;
+}
+
+static int ssp_resume(struct device *dev)
+{
+	int ret;
+	struct ssp_data *data = spi_get_drvdata(to_spi_device(dev));
+
+	enable_irq(data->spi->irq);
+
+	if (atomic_read(&data->enable_refcount) > 0)
+		ssp_enable_wdt_timer(data);
+
+	ret = ssp_command(data, SSP_MSG2SSP_AP_STATUS_RESUME, 0);
+	if (ret < 0) {
+		dev_err(&data->spi->dev,
+			"%s SSP_MSG2SSP_AP_STATUS_RESUME failed\n", __func__);
+		ssp_disable_wdt_timer(data);
+		return ret;
+	}
+
+	/* timesyncing is set by MCU */
+	data->last_resume_state = SSP_MSG2SSP_AP_STATUS_RESUME;
+
+	return 0;
+}
+
+static const struct dev_pm_ops ssp_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(ssp_suspend, ssp_resume)
+};
+
+static struct spi_driver ssp_driver = {
+	.probe = ssp_probe,
+	.remove = ssp_remove,
+	.driver = {
+		.pm = &ssp_pm_ops,
+		.bus = &spi_bus_type,
+		.owner = THIS_MODULE,
+		.of_match_table = of_match_ptr(ssp_of_match),
+		.name = "sensorhub"
+	},
+};
+
+module_spi_driver(ssp_driver);
+
+MODULE_DESCRIPTION("ssp sensorhub driver");
+MODULE_AUTHOR("Samsung Electronics");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iio/common/ssp_sensors/ssp_spi.c b/drivers/iio/common/ssp_sensors/ssp_spi.c
new file mode 100644
index 000000000000..704284a475ae
--- /dev/null
+++ b/drivers/iio/common/ssp_sensors/ssp_spi.c
@@ -0,0 +1,608 @@
+/*
+ *  Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#include "ssp.h"
+
+#define SSP_DEV (&data->spi->dev)
+#define SSP_GET_MESSAGE_TYPE(data) (data & (3 << SSP_RW))
+
+/*
+ * SSP -> AP Instruction
+ * They tell what packet type can be expected. In the future there will
+ * be less of them. BYPASS means common sensor packets with accel, gyro,
+ * hrm etc. data. LIBRARY and META are mock-up's for now.
+ */
+#define SSP_MSG2AP_INST_BYPASS_DATA		0x37
+#define SSP_MSG2AP_INST_LIBRARY_DATA		0x01
+#define SSP_MSG2AP_INST_DEBUG_DATA		0x03
+#define SSP_MSG2AP_INST_BIG_DATA		0x04
+#define SSP_MSG2AP_INST_META_DATA		0x05
+#define SSP_MSG2AP_INST_TIME_SYNC		0x06
+#define SSP_MSG2AP_INST_RESET			0x07
+
+#define SSP_UNIMPLEMENTED -1
+
+struct ssp_msg_header {
+	u8 cmd;
+	__le16 length;
+	__le16 options;
+	__le32 data;
+} __attribute__((__packed__));
+
+struct ssp_msg {
+	u16 length;
+	u16 options;
+	struct list_head list;
+	struct completion *done;
+	struct ssp_msg_header *h;
+	char *buffer;
+};
+
+static const int ssp_offset_map[SSP_SENSOR_MAX] = {
+	[SSP_ACCELEROMETER_SENSOR] =		SSP_ACCELEROMETER_SIZE +
+						SSP_TIME_SIZE,
+	[SSP_GYROSCOPE_SENSOR] =		SSP_GYROSCOPE_SIZE +
+						SSP_TIME_SIZE,
+	[SSP_GEOMAGNETIC_UNCALIB_SENSOR] =	SSP_UNIMPLEMENTED,
+	[SSP_GEOMAGNETIC_RAW] =			SSP_UNIMPLEMENTED,
+	[SSP_GEOMAGNETIC_SENSOR] =		SSP_UNIMPLEMENTED,
+	[SSP_PRESSURE_SENSOR] =			SSP_UNIMPLEMENTED,
+	[SSP_GESTURE_SENSOR] =			SSP_UNIMPLEMENTED,
+	[SSP_PROXIMITY_SENSOR] =		SSP_UNIMPLEMENTED,
+	[SSP_TEMPERATURE_HUMIDITY_SENSOR] =	SSP_UNIMPLEMENTED,
+	[SSP_LIGHT_SENSOR] =			SSP_UNIMPLEMENTED,
+	[SSP_PROXIMITY_RAW] =			SSP_UNIMPLEMENTED,
+	[SSP_ORIENTATION_SENSOR] =		SSP_UNIMPLEMENTED,
+	[SSP_STEP_DETECTOR] =			SSP_UNIMPLEMENTED,
+	[SSP_SIG_MOTION_SENSOR] =		SSP_UNIMPLEMENTED,
+	[SSP_GYRO_UNCALIB_SENSOR] =		SSP_UNIMPLEMENTED,
+	[SSP_GAME_ROTATION_VECTOR] =		SSP_UNIMPLEMENTED,
+	[SSP_ROTATION_VECTOR] =			SSP_UNIMPLEMENTED,
+	[SSP_STEP_COUNTER] =			SSP_UNIMPLEMENTED,
+	[SSP_BIO_HRM_RAW] =			SSP_BIO_HRM_RAW_SIZE +
+						SSP_TIME_SIZE,
+	[SSP_BIO_HRM_RAW_FAC] =			SSP_BIO_HRM_RAW_FAC_SIZE +
+						SSP_TIME_SIZE,
+	[SSP_BIO_HRM_LIB] =			SSP_BIO_HRM_LIB_SIZE +
+						SSP_TIME_SIZE,
+};
+
+#define SSP_HEADER_SIZE		(sizeof(struct ssp_msg_header))
+#define SSP_HEADER_SIZE_ALIGNED	(ALIGN(SSP_HEADER_SIZE, 4))
+
+static struct ssp_msg *ssp_create_msg(u8 cmd, u16 len, u16 opt, u32 data)
+{
+	struct ssp_msg_header h;
+	struct ssp_msg *msg;
+
+	msg = kzalloc(sizeof(*msg), GFP_KERNEL);
+	if (!msg)
+		return NULL;
+
+	h.cmd = cmd;
+	h.length = cpu_to_le16(len);
+	h.options = cpu_to_le16(opt);
+	h.data = cpu_to_le32(data);
+
+	msg->buffer = kzalloc(SSP_HEADER_SIZE_ALIGNED + len,
+			      GFP_KERNEL | GFP_DMA);
+	if (!msg->buffer) {
+		kfree(msg);
+		return NULL;
+	}
+
+	msg->length = len;
+	msg->options = opt;
+
+	memcpy(msg->buffer, &h, SSP_HEADER_SIZE);
+
+	return msg;
+}
+
+/*
+ * It is a bit heavy to do it this way but often the function is used to compose
+ * the message from smaller chunks which are placed on the stack.  Often the
+ * chunks are small so memcpy should be optimalized.
+ */
+static inline void ssp_fill_buffer(struct ssp_msg *m, unsigned int offset,
+				   const void *src, unsigned int len)
+{
+	memcpy(&m->buffer[SSP_HEADER_SIZE_ALIGNED + offset], src, len);
+}
+
+static inline void ssp_get_buffer(struct ssp_msg *m, unsigned int offset,
+				  void *dest, unsigned int len)
+{
+	memcpy(dest, &m->buffer[SSP_HEADER_SIZE_ALIGNED + offset],  len);
+}
+
+#define SSP_GET_BUFFER_AT_INDEX(m, index) \
+	(m->buffer[SSP_HEADER_SIZE_ALIGNED + index])
+#define SSP_SET_BUFFER_AT_INDEX(m, index, val) \
+	(m->buffer[SSP_HEADER_SIZE_ALIGNED + index] = val)
+
+static void ssp_clean_msg(struct ssp_msg *m)
+{
+	kfree(m->buffer);
+	kfree(m);
+}
+
+static int ssp_print_mcu_debug(char *data_frame, int *data_index,
+			       int received_len)
+{
+	int length = data_frame[(*data_index)++];
+
+	if (length > received_len - *data_index || length <= 0) {
+		ssp_dbg("[SSP]: MSG From MCU-invalid debug length(%d/%d)\n",
+			length, received_len);
+		return length ? length : -EPROTO;
+	}
+
+	ssp_dbg("[SSP]: MSG From MCU - %s\n", &data_frame[*data_index]);
+
+	*data_index += length;
+
+	return 0;
+}
+
+/*
+ * It was designed that way - additional lines to some kind of handshake,
+ * please do not ask why - only the firmware guy can know it.
+ */
+static int ssp_check_lines(struct ssp_data *data, bool state)
+{
+	int delay_cnt = 0;
+
+	gpio_set_value_cansleep(data->ap_mcu_gpio, state);
+
+	while (gpio_get_value_cansleep(data->mcu_ap_gpio) != state) {
+		usleep_range(3000, 3500);
+
+		if (data->shut_down || delay_cnt++ > 500) {
+			dev_err(SSP_DEV, "%s:timeout, hw ack wait fail %d\n",
+				__func__, state);
+
+			if (!state)
+				gpio_set_value_cansleep(data->ap_mcu_gpio, 1);
+
+			return -ETIMEDOUT;
+		}
+	}
+
+	return 0;
+}
+
+static int ssp_do_transfer(struct ssp_data *data, struct ssp_msg *msg,
+			   struct completion *done, int timeout)
+{
+	int status;
+	/*
+	 * check if this is a short one way message or the whole transfer has
+	 * second part after an interrupt
+	 */
+	const bool use_no_irq = msg->length == 0;
+
+	if (data->shut_down)
+		return -EPERM;
+
+	msg->done = done;
+
+	mutex_lock(&data->comm_lock);
+
+	status = ssp_check_lines(data, false);
+	if (status < 0)
+		goto _error_locked;
+
+	status = spi_write(data->spi, msg->buffer, SSP_HEADER_SIZE);
+	if (status < 0) {
+		gpio_set_value_cansleep(data->ap_mcu_gpio, 1);
+		dev_err(SSP_DEV, "%s spi_write fail\n", __func__);
+		goto _error_locked;
+	}
+
+	if (!use_no_irq) {
+		mutex_lock(&data->pending_lock);
+		list_add_tail(&msg->list, &data->pending_list);
+		mutex_unlock(&data->pending_lock);
+	}
+
+	status = ssp_check_lines(data, true);
+	if (status < 0) {
+		if (!use_no_irq) {
+			mutex_lock(&data->pending_lock);
+			list_del(&msg->list);
+			mutex_unlock(&data->pending_lock);
+		}
+		goto _error_locked;
+	}
+
+	mutex_unlock(&data->comm_lock);
+
+	if (!use_no_irq && done)
+		if (wait_for_completion_timeout(done,
+						msecs_to_jiffies(timeout)) ==
+		    0) {
+			mutex_lock(&data->pending_lock);
+			list_del(&msg->list);
+			mutex_unlock(&data->pending_lock);
+
+			data->timeout_cnt++;
+			return -ETIMEDOUT;
+		}
+
+	return 0;
+
+_error_locked:
+	mutex_unlock(&data->comm_lock);
+	data->timeout_cnt++;
+	return status;
+}
+
+static inline int ssp_spi_sync_command(struct ssp_data *data,
+				       struct ssp_msg *msg)
+{
+	return ssp_do_transfer(data, msg, NULL, 0);
+}
+
+static int ssp_spi_sync(struct ssp_data *data, struct ssp_msg *msg,
+			int timeout)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	if (WARN_ON(!msg->length))
+		return -EPERM;
+
+	return ssp_do_transfer(data, msg, &done, timeout);
+}
+
+static int ssp_handle_big_data(struct ssp_data *data, char *dataframe, int *idx)
+{
+	/* mock-up, it will be changed with adding another sensor types */
+	*idx += 8;
+	return 0;
+}
+
+static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len)
+{
+	int idx, sd;
+	struct timespec ts;
+	struct ssp_sensor_data *spd;
+	struct iio_dev **indio_devs = data->sensor_devs;
+
+	getnstimeofday(&ts);
+
+	for (idx = 0; idx < len;) {
+		switch (dataframe[idx++]) {
+		case SSP_MSG2AP_INST_BYPASS_DATA:
+			sd = dataframe[idx++];
+			if (sd < 0 || sd >= SSP_SENSOR_MAX) {
+				dev_err(SSP_DEV,
+					"Mcu data frame1 error %d\n", sd);
+				return -EPROTO;
+			}
+
+			if (indio_devs[sd]) {
+				spd = iio_priv(indio_devs[sd]);
+				if (spd->process_data)
+					spd->process_data(indio_devs[sd],
+							  &dataframe[idx],
+							  data->timestamp);
+			} else {
+				dev_err(SSP_DEV, "no client for frame\n");
+			}
+
+			idx += ssp_offset_map[sd];
+			break;
+		case SSP_MSG2AP_INST_DEBUG_DATA:
+			sd = ssp_print_mcu_debug(dataframe, &idx, len);
+			if (sd) {
+				dev_err(SSP_DEV,
+					"Mcu data frame3 error %d\n", sd);
+				return sd;
+			}
+			break;
+		case SSP_MSG2AP_INST_LIBRARY_DATA:
+			idx += len;
+			break;
+		case SSP_MSG2AP_INST_BIG_DATA:
+			ssp_handle_big_data(data, dataframe, &idx);
+			break;
+		case SSP_MSG2AP_INST_TIME_SYNC:
+			data->time_syncing = true;
+			break;
+		case SSP_MSG2AP_INST_RESET:
+			ssp_queue_ssp_refresh_task(data, 0);
+			break;
+		}
+	}
+
+	if (data->time_syncing)
+		data->timestamp = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+
+	return 0;
+}
+
+/* threaded irq */
+int ssp_irq_msg(struct ssp_data *data)
+{
+	bool found = false;
+	char *buffer;
+	u8 msg_type;
+	int ret;
+	u16 length, msg_options;
+	struct ssp_msg *msg, *n;
+
+	ret = spi_read(data->spi, data->header_buffer, SSP_HEADER_BUFFER_SIZE);
+	if (ret < 0) {
+		dev_err(SSP_DEV, "header read fail\n");
+		return ret;
+	}
+
+	length = le16_to_cpu(data->header_buffer[1]);
+	msg_options = le16_to_cpu(data->header_buffer[0]);
+
+	if (length == 0) {
+		dev_err(SSP_DEV, "length received from mcu is 0\n");
+		return -EINVAL;
+	}
+
+	msg_type = SSP_GET_MESSAGE_TYPE(msg_options);
+
+	switch (msg_type) {
+	case SSP_AP2HUB_READ:
+	case SSP_AP2HUB_WRITE:
+		/*
+		 * this is a small list, a few elements - the packets can be
+		 * received with no order
+		 */
+		mutex_lock(&data->pending_lock);
+		list_for_each_entry_safe(msg, n, &data->pending_list, list) {
+			if (msg->options == msg_options) {
+				list_del(&msg->list);
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			/*
+			 * here can be implemented dead messages handling
+			 * but the slave should not send such ones - it is to
+			 * check but let's handle this
+			 */
+			buffer = kmalloc(length, GFP_KERNEL | GFP_DMA);
+			if (!buffer) {
+				ret = -ENOMEM;
+				goto _unlock;
+			}
+
+			/* got dead packet so it is always an error */
+			ret = spi_read(data->spi, buffer, length);
+			if (ret >= 0)
+				ret = -EPROTO;
+
+			kfree(buffer);
+
+			dev_err(SSP_DEV, "No match error %x\n",
+				msg_options);
+
+			goto _unlock;
+		}
+
+		if (msg_type == SSP_AP2HUB_READ)
+			ret = spi_read(data->spi,
+				       &msg->buffer[SSP_HEADER_SIZE_ALIGNED],
+				       msg->length);
+
+		if (msg_type == SSP_AP2HUB_WRITE) {
+			ret = spi_write(data->spi,
+					&msg->buffer[SSP_HEADER_SIZE_ALIGNED],
+					msg->length);
+			if (msg_options & SSP_AP2HUB_RETURN) {
+				msg->options =
+					SSP_AP2HUB_READ | SSP_AP2HUB_RETURN;
+				msg->length = 1;
+
+				list_add_tail(&msg->list, &data->pending_list);
+				goto _unlock;
+			}
+		}
+
+		if (msg->done)
+			if (!completion_done(msg->done))
+				complete(msg->done);
+_unlock:
+		mutex_unlock(&data->pending_lock);
+		break;
+	case SSP_HUB2AP_WRITE:
+		buffer = kzalloc(length, GFP_KERNEL | GFP_DMA);
+		if (!buffer)
+			return -ENOMEM;
+
+		ret = spi_read(data->spi, buffer, length);
+		if (ret < 0) {
+			dev_err(SSP_DEV, "spi read fail\n");
+			kfree(buffer);
+			break;
+		}
+
+		ret = ssp_parse_dataframe(data, buffer, length);
+
+		kfree(buffer);
+		break;
+
+	default:
+		dev_err(SSP_DEV, "unknown msg type\n");
+		return -EPROTO;
+	}
+
+	return ret;
+}
+
+void ssp_clean_pending_list(struct ssp_data *data)
+{
+	struct ssp_msg *msg, *n;
+
+	mutex_lock(&data->pending_lock);
+	list_for_each_entry_safe(msg, n, &data->pending_list, list) {
+		list_del(&msg->list);
+
+		if (msg->done)
+			if (!completion_done(msg->done))
+				complete(msg->done);
+	}
+	mutex_unlock(&data->pending_lock);
+}
+
+int ssp_command(struct ssp_data *data, char command, int arg)
+{
+	int ret;
+	struct ssp_msg *msg;
+
+	msg = ssp_create_msg(command, 0, SSP_AP2HUB_WRITE, arg);
+	if (!msg)
+		return -ENOMEM;
+
+	ssp_dbg("%s - command 0x%x %d\n", __func__, command, arg);
+
+	ret = ssp_spi_sync_command(data, msg);
+	ssp_clean_msg(msg);
+
+	return ret;
+}
+
+int ssp_send_instruction(struct ssp_data *data, u8 inst, u8 sensor_type,
+			 u8 *send_buf, u8 length)
+{
+	int ret;
+	struct ssp_msg *msg;
+
+	if (data->fw_dl_state == SSP_FW_DL_STATE_DOWNLOADING) {
+		dev_err(SSP_DEV, "%s - Skip Inst! DL state = %d\n",
+			__func__, data->fw_dl_state);
+		return -EBUSY;
+	} else if (!(data->available_sensors & BIT(sensor_type)) &&
+		   (inst <= SSP_MSG2SSP_INST_CHANGE_DELAY)) {
+		dev_err(SSP_DEV, "%s - Bypass Inst Skip! - %u\n",
+			__func__, sensor_type);
+		return -EIO; /* just fail */
+	}
+
+	msg = ssp_create_msg(inst, length + 2, SSP_AP2HUB_WRITE, 0);
+	if (!msg)
+		return -ENOMEM;
+
+	ssp_fill_buffer(msg, 0, &sensor_type, 1);
+	ssp_fill_buffer(msg, 1, send_buf, length);
+
+	ssp_dbg("%s - Inst = 0x%x, Sensor Type = 0x%x, data = %u\n",
+		__func__, inst, sensor_type, send_buf[1]);
+
+	ret = ssp_spi_sync(data, msg, 1000);
+	ssp_clean_msg(msg);
+
+	return ret;
+}
+
+int ssp_get_chipid(struct ssp_data *data)
+{
+	int ret;
+	char buffer;
+	struct ssp_msg *msg;
+
+	msg = ssp_create_msg(SSP_MSG2SSP_AP_WHOAMI, 1, SSP_AP2HUB_READ, 0);
+	if (!msg)
+		return -ENOMEM;
+
+	ret = ssp_spi_sync(data, msg, 1000);
+
+	buffer = SSP_GET_BUFFER_AT_INDEX(msg, 0);
+
+	ssp_clean_msg(msg);
+
+	return ret < 0 ? ret : buffer;
+}
+
+int ssp_set_magnetic_matrix(struct ssp_data *data)
+{
+	int ret;
+	struct ssp_msg *msg;
+
+	msg = ssp_create_msg(SSP_MSG2SSP_AP_SET_MAGNETIC_STATIC_MATRIX,
+			     data->sensorhub_info->mag_length, SSP_AP2HUB_WRITE,
+			     0);
+	if (!msg)
+		return -ENOMEM;
+
+	ssp_fill_buffer(msg, 0, data->sensorhub_info->mag_table,
+			data->sensorhub_info->mag_length);
+
+	ret = ssp_spi_sync(data, msg, 1000);
+	ssp_clean_msg(msg);
+
+	return ret;
+}
+
+unsigned int ssp_get_sensor_scanning_info(struct ssp_data *data)
+{
+	int ret;
+	__le32 result;
+	u32 cpu_result = 0;
+
+	struct ssp_msg *msg = ssp_create_msg(SSP_MSG2SSP_AP_SENSOR_SCANNING, 4,
+					     SSP_AP2HUB_READ, 0);
+	if (!msg)
+		return 0;
+
+	ret = ssp_spi_sync(data, msg, 1000);
+	if (ret < 0) {
+		dev_err(SSP_DEV, "%s - spi read fail %d\n", __func__, ret);
+		goto _exit;
+	}
+
+	ssp_get_buffer(msg, 0, &result, 4);
+	cpu_result = le32_to_cpu(result);
+
+	dev_info(SSP_DEV, "%s state: 0x%08x\n", __func__, cpu_result);
+
+_exit:
+	ssp_clean_msg(msg);
+	return cpu_result;
+}
+
+unsigned int ssp_get_firmware_rev(struct ssp_data *data)
+{
+	int ret;
+	__le32 result;
+
+	struct ssp_msg *msg = ssp_create_msg(SSP_MSG2SSP_AP_FIRMWARE_REV, 4,
+					     SSP_AP2HUB_READ, 0);
+	if (!msg)
+		return SSP_INVALID_REVISION;
+
+	ret = ssp_spi_sync(data, msg, 1000);
+	if (ret < 0) {
+		dev_err(SSP_DEV, "%s - transfer fail %d\n", __func__, ret);
+		ret = SSP_INVALID_REVISION;
+		goto _exit;
+	}
+
+	ssp_get_buffer(msg, 0, &result, 4);
+	ret = le32_to_cpu(result);
+
+_exit:
+	ssp_clean_msg(msg);
+	return ret;
+}
diff --git a/include/linux/iio/common/ssp_sensors.h b/include/linux/iio/common/ssp_sensors.h
new file mode 100644
index 000000000000..f4d1b0edb432
--- /dev/null
+++ b/include/linux/iio/common/ssp_sensors.h
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ */
+#ifndef _SSP_SENSORS_H_
+#define _SSP_SENSORS_H_
+
+#include <linux/iio/iio.h>
+
+#define SSP_TIME_SIZE				4
+#define SSP_ACCELEROMETER_SIZE			6
+#define SSP_GYROSCOPE_SIZE			6
+#define SSP_BIO_HRM_RAW_SIZE			8
+#define SSP_BIO_HRM_RAW_FAC_SIZE		36
+#define SSP_BIO_HRM_LIB_SIZE			8
+
+/**
+ * enum ssp_sensor_type - SSP sensor type
+ */
+enum ssp_sensor_type {
+	SSP_ACCELEROMETER_SENSOR = 0,
+	SSP_GYROSCOPE_SENSOR,
+	SSP_GEOMAGNETIC_UNCALIB_SENSOR,
+	SSP_GEOMAGNETIC_RAW,
+	SSP_GEOMAGNETIC_SENSOR,
+	SSP_PRESSURE_SENSOR,
+	SSP_GESTURE_SENSOR,
+	SSP_PROXIMITY_SENSOR,
+	SSP_TEMPERATURE_HUMIDITY_SENSOR,
+	SSP_LIGHT_SENSOR,
+	SSP_PROXIMITY_RAW,
+	SSP_ORIENTATION_SENSOR,
+	SSP_STEP_DETECTOR,
+	SSP_SIG_MOTION_SENSOR,
+	SSP_GYRO_UNCALIB_SENSOR,
+	SSP_GAME_ROTATION_VECTOR,
+	SSP_ROTATION_VECTOR,
+	SSP_STEP_COUNTER,
+	SSP_BIO_HRM_RAW,
+	SSP_BIO_HRM_RAW_FAC,
+	SSP_BIO_HRM_LIB,
+	SSP_SENSOR_MAX,
+};
+
+struct ssp_data;
+
+/**
+ * struct ssp_sensor_data - Sensor object
+ * @process_data:	Callback to feed sensor data.
+ * @type:		Used sensor type.
+ * @buffer:		Received data buffer.
+ */
+struct ssp_sensor_data {
+	int (*process_data)(struct iio_dev *indio_dev, void *buf,
+			    int64_t timestamp);
+	enum ssp_sensor_type type;
+	u8 *buffer;
+};
+
+void ssp_register_consumer(struct iio_dev *indio_dev,
+			   enum ssp_sensor_type type);
+
+int ssp_enable_sensor(struct ssp_data *data, enum ssp_sensor_type type,
+		      u32 delay);
+
+int ssp_disable_sensor(struct ssp_data *data, enum ssp_sensor_type type);
+
+u32 ssp_get_sensor_delay(struct ssp_data *data, enum ssp_sensor_type);
+
+int ssp_change_delay(struct ssp_data *data, enum ssp_sensor_type type,
+		     u32 delay);
+#endif /* _SSP_SENSORS_H_ */
-- 
cgit v1.2.3


From 2f0ecb7c6563d711bec15268d56adf1c630e77d1 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Tue, 27 Jan 2015 20:41:52 +0200
Subject: iio: core: Introduce IIO_CHAN_INFO_DEBOUNCE_COUNT and _TIME

The pedometer needs to filter out false steps that might be generated by
tapping the foot, sitting, etc. To do that it computes the number of
steps that occur in a given time and decides the user is moving only
if this value is over a threshold. E.g.: the user starts moving only
if he takes 4 steps in 3 seconds. This filter is applied only when
the user starts moving.

A device that has such pedometer functionality is Freescale's MMA9553L:
http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf.

To export this feature, this patch introduces IIO_CHAN_INFO_DEBOUNCE_COUNT
and IIO_CHAN_INFO_DEBOUNCE_TIME. For the pedometer, in_steps_debounce_count
will specify the number of steps that need to occur in
in_steps_debounce_time seconds so that the pedometer decides the user is
moving.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 15 +++++++++++++++
 drivers/iio/industrialio-core.c         |  2 ++
 include/linux/iio/iio.h                 |  2 ++
 3 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index c03a1401b9ca..b4ea9c521f69 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1193,3 +1193,18 @@ Description:
 		This attribute is used to read the current speed value of the
 		user (which is the norm or magnitude of the velocity vector).
 		Units after application of scale are m/s.
+
+What:		/sys/.../iio:deviceX/in_steps_debounce_count
+KernelVersion:	3.20
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Specifies the number of steps that must occur within
+		in_steps_filter_debounce_time for the pedometer to decide the
+		consumer is making steps.
+
+What:		/sys/.../iio:deviceX/in_steps_debounce_time
+KernelVersion:	3.20
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Specifies number of seconds in which we compute the steps
+		that occur in order to decide if the consumer is making steps.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 4ee6fdfa92fe..aaba9d3d980e 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -126,6 +126,8 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_ENABLE] = "en",
 	[IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight",
 	[IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight",
+	[IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count",
+	[IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time",
 };
 
 /**
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 51f16437dacc..80d855061064 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -41,6 +41,8 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_ENABLE,
 	IIO_CHAN_INFO_CALIBHEIGHT,
 	IIO_CHAN_INFO_CALIBWEIGHT,
+	IIO_CHAN_INFO_DEBOUNCE_COUNT,
+	IIO_CHAN_INFO_DEBOUNCE_TIME,
 };
 
 enum iio_shared_by {
-- 
cgit v1.2.3


From 712eddf70225ab5ae65e946e22d2dfe6b93e8dd1 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Sat, 24 Jan 2015 14:05:50 +0900
Subject: cpuidle: exynos: add coupled cpuidle support for exynos4210

The following patch adds coupled cpuidle support for Exynos4210 to
an existing cpuidle-exynos driver.  As a result it enables AFTR mode
to be used by default on Exynos4210 without the need to hot unplug
CPU1 first.

The patch is heavily based on earlier cpuidle-exynos4210 driver from
Daniel Lezcano:

http://www.spinics.net/lists/linux-samsung-soc/msg28134.html

Changes from Daniel's code include:
- porting code to current kernels
- fixing it to work on my setup (by using S5P_INFORM register
  instead of S5P_VA_SYSRAM one on Revison 1.1 and retrying poking
  CPU1 out of the BOOT ROM if necessary)
- fixing rare lockup caused by waiting for CPU1 to get stuck in
  the BOOT ROM (CPU hotplug code in arch/arm/mach-exynos/platsmp.c
  doesn't require this and works fine)
- moving Exynos specific code to arch/arm/mach-exynos/pm.c
- using cpu_boot_reg_base() helper instead of BOOT_VECTOR macro
- using exynos_cpu_*() helpers instead of accessing registers
  directly
- using arch_send_wakeup_ipi_mask() instead of dsb_sev()
  (this matches CPU hotplug code in arch/arm/mach-exynos/platsmp.c)
- integrating separate exynos4210-cpuidle driver into existing
  exynos-cpuidle one

Cc: Colin Cross <ccross@google.com>
Cc: Kukjin Kim <kgene.kim@samsung.com>
Cc: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Cc: Tomasz Figa <tomasz.figa@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Kukjin Kim <kgene@kernel.org>
---
 arch/arm/mach-exynos/common.h                |   4 +
 arch/arm/mach-exynos/exynos.c                |   4 +
 arch/arm/mach-exynos/platsmp.c               |   2 +-
 arch/arm/mach-exynos/pm.c                    | 122 +++++++++++++++++++++++++++
 drivers/cpuidle/Kconfig.arm                  |   1 +
 drivers/cpuidle/cpuidle-exynos.c             |  76 +++++++++++++++--
 include/linux/platform_data/cpuidle-exynos.h |  20 +++++
 7 files changed, 223 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/platform_data/cpuidle-exynos.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h
index 865f878063cc..f70eca7ee705 100644
--- a/arch/arm/mach-exynos/common.h
+++ b/arch/arm/mach-exynos/common.h
@@ -13,6 +13,7 @@
 #define __ARCH_ARM_MACH_EXYNOS_COMMON_H
 
 #include <linux/of.h>
+#include <linux/platform_data/cpuidle-exynos.h>
 
 #define EXYNOS3250_SOC_ID	0xE3472000
 #define EXYNOS3_SOC_MASK	0xFFFFF000
@@ -150,8 +151,11 @@ extern void exynos_pm_central_suspend(void);
 extern int exynos_pm_central_resume(void);
 extern void exynos_enter_aftr(void);
 
+extern struct cpuidle_exynos_data cpuidle_coupled_exynos_data;
+
 extern void s5p_init_cpu(void __iomem *cpuid_addr);
 extern unsigned int samsung_rev(void);
+extern void __iomem *cpu_boot_reg_base(void);
 
 static inline void pmu_raw_writel(u32 val, u32 offset)
 {
diff --git a/arch/arm/mach-exynos/exynos.c b/arch/arm/mach-exynos/exynos.c
index c13d0837fa8c..509f2e5a2d70 100644
--- a/arch/arm/mach-exynos/exynos.c
+++ b/arch/arm/mach-exynos/exynos.c
@@ -246,6 +246,10 @@ static void __init exynos_dt_machine_init(void)
 	if (!IS_ENABLED(CONFIG_SMP))
 		exynos_sysram_init();
 
+#ifdef CONFIG_ARM_EXYNOS_CPUIDLE
+	if (of_machine_is_compatible("samsung,exynos4210"))
+		exynos_cpuidle.dev.platform_data = &cpuidle_coupled_exynos_data;
+#endif
 	if (of_machine_is_compatible("samsung,exynos4210") ||
 	    of_machine_is_compatible("samsung,exynos4212") ||
 	    (of_machine_is_compatible("samsung,exynos4412") &&
diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
index 7a1ebfeeeeb8..3f32c47a6d74 100644
--- a/arch/arm/mach-exynos/platsmp.c
+++ b/arch/arm/mach-exynos/platsmp.c
@@ -194,7 +194,7 @@ int exynos_cluster_power_state(int cluster)
 		S5P_CORE_LOCAL_PWR_EN);
 }
 
-static inline void __iomem *cpu_boot_reg_base(void)
+void __iomem *cpu_boot_reg_base(void)
 {
 	if (soc_is_exynos4210() && samsung_rev() == EXYNOS4210_REV_1_1)
 		return pmu_base_addr + S5P_INFORM5;
diff --git a/arch/arm/mach-exynos/pm.c b/arch/arm/mach-exynos/pm.c
index 159eb4c9779e..1f9cbd434c60 100644
--- a/arch/arm/mach-exynos/pm.c
+++ b/arch/arm/mach-exynos/pm.c
@@ -179,3 +179,125 @@ void exynos_enter_aftr(void)
 
 	cpu_pm_exit();
 }
+
+static atomic_t cpu1_wakeup = ATOMIC_INIT(0);
+
+static int exynos_cpu0_enter_aftr(void)
+{
+	int ret = -1;
+
+	/*
+	 * If the other cpu is powered on, we have to power it off, because
+	 * the AFTR state won't work otherwise
+	 */
+	if (cpu_online(1)) {
+		/*
+		 * We reach a sync point with the coupled idle state, we know
+		 * the other cpu will power down itself or will abort the
+		 * sequence, let's wait for one of these to happen
+		 */
+		while (exynos_cpu_power_state(1)) {
+			/*
+			 * The other cpu may skip idle and boot back
+			 * up again
+			 */
+			if (atomic_read(&cpu1_wakeup))
+				goto abort;
+
+			/*
+			 * The other cpu may bounce through idle and
+			 * boot back up again, getting stuck in the
+			 * boot rom code
+			 */
+			if (__raw_readl(cpu_boot_reg_base()) == 0)
+				goto abort;
+
+			cpu_relax();
+		}
+	}
+
+	exynos_enter_aftr();
+	ret = 0;
+
+abort:
+	if (cpu_online(1)) {
+		/*
+		 * Set the boot vector to something non-zero
+		 */
+		__raw_writel(virt_to_phys(exynos_cpu_resume),
+			     cpu_boot_reg_base());
+		dsb();
+
+		/*
+		 * Turn on cpu1 and wait for it to be on
+		 */
+		exynos_cpu_power_up(1);
+		while (exynos_cpu_power_state(1) != S5P_CORE_LOCAL_PWR_EN)
+			cpu_relax();
+
+		while (!atomic_read(&cpu1_wakeup)) {
+			/*
+			 * Poke cpu1 out of the boot rom
+			 */
+			__raw_writel(virt_to_phys(exynos_cpu_resume),
+				     cpu_boot_reg_base());
+
+			arch_send_wakeup_ipi_mask(cpumask_of(1));
+		}
+	}
+
+	return ret;
+}
+
+static int exynos_wfi_finisher(unsigned long flags)
+{
+	cpu_do_idle();
+
+	return -1;
+}
+
+static int exynos_cpu1_powerdown(void)
+{
+	int ret = -1;
+
+	/*
+	 * Idle sequence for cpu1
+	 */
+	if (cpu_pm_enter())
+		goto cpu1_aborted;
+
+	/*
+	 * Turn off cpu 1
+	 */
+	exynos_cpu_power_down(1);
+
+	ret = cpu_suspend(0, exynos_wfi_finisher);
+
+	cpu_pm_exit();
+
+cpu1_aborted:
+	dsb();
+	/*
+	 * Notify cpu 0 that cpu 1 is awake
+	 */
+	atomic_set(&cpu1_wakeup, 1);
+
+	return ret;
+}
+
+static void exynos_pre_enter_aftr(void)
+{
+	__raw_writel(virt_to_phys(exynos_cpu_resume), cpu_boot_reg_base());
+}
+
+static void exynos_post_enter_aftr(void)
+{
+	atomic_set(&cpu1_wakeup, 0);
+}
+
+struct cpuidle_exynos_data cpuidle_coupled_exynos_data = {
+	.cpu0_enter_aftr		= exynos_cpu0_enter_aftr,
+	.cpu1_powerdown		= exynos_cpu1_powerdown,
+	.pre_enter_aftr		= exynos_pre_enter_aftr,
+	.post_enter_aftr		= exynos_post_enter_aftr,
+};
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index 8c16ab20fb15..8e07c9419153 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -55,6 +55,7 @@ config ARM_AT91_CPUIDLE
 config ARM_EXYNOS_CPUIDLE
 	bool "Cpu Idle Driver for the Exynos processors"
 	depends on ARCH_EXYNOS
+	select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
 	help
 	  Select this to enable cpuidle for Exynos processors
 
diff --git a/drivers/cpuidle/cpuidle-exynos.c b/drivers/cpuidle/cpuidle-exynos.c
index 4003a3160865..26f5f29fdb03 100644
--- a/drivers/cpuidle/cpuidle-exynos.c
+++ b/drivers/cpuidle/cpuidle-exynos.c
@@ -1,8 +1,11 @@
-/* linux/arch/arm/mach-exynos/cpuidle.c
- *
- * Copyright (c) 2011 Samsung Electronics Co., Ltd.
+/*
+ * Copyright (c) 2011-2014 Samsung Electronics Co., Ltd.
  *		http://www.samsung.com
  *
+ * Coupled cpuidle support based on the work of:
+ *	Colin Cross <ccross@android.com>
+ *	Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -13,13 +16,49 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/platform_data/cpuidle-exynos.h>
 
 #include <asm/proc-fns.h>
 #include <asm/suspend.h>
 #include <asm/cpuidle.h>
 
+static atomic_t exynos_idle_barrier;
+
+static struct cpuidle_exynos_data *exynos_cpuidle_pdata;
 static void (*exynos_enter_aftr)(void);
 
+static int exynos_enter_coupled_lowpower(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
+{
+	int ret;
+
+	exynos_cpuidle_pdata->pre_enter_aftr();
+
+	/*
+	 * Waiting all cpus to reach this point at the same moment
+	 */
+	cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier);
+
+	/*
+	 * Both cpus will reach this point at the same time
+	 */
+	ret = dev->cpu ? exynos_cpuidle_pdata->cpu1_powerdown()
+		       : exynos_cpuidle_pdata->cpu0_enter_aftr();
+	if (ret)
+		index = ret;
+
+	/*
+	 * Waiting all cpus to finish the power sequence before going further
+	 */
+	cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier);
+
+	exynos_cpuidle_pdata->post_enter_aftr();
+
+	return index;
+}
+
 static int exynos_enter_lowpower(struct cpuidle_device *dev,
 				struct cpuidle_driver *drv,
 				int index)
@@ -55,13 +94,40 @@ static struct cpuidle_driver exynos_idle_driver = {
 	.safe_state_index = 0,
 };
 
+static struct cpuidle_driver exynos_coupled_idle_driver = {
+	.name			= "exynos_coupled_idle",
+	.owner			= THIS_MODULE,
+	.states = {
+		[0] = ARM_CPUIDLE_WFI_STATE,
+		[1] = {
+			.enter			= exynos_enter_coupled_lowpower,
+			.exit_latency		= 5000,
+			.target_residency	= 10000,
+			.flags			= CPUIDLE_FLAG_COUPLED |
+						  CPUIDLE_FLAG_TIMER_STOP,
+			.name			= "C1",
+			.desc			= "ARM power down",
+		},
+	},
+	.state_count = 2,
+	.safe_state_index = 0,
+};
+
 static int exynos_cpuidle_probe(struct platform_device *pdev)
 {
 	int ret;
 
-	exynos_enter_aftr = (void *)(pdev->dev.platform_data);
+	if (of_machine_is_compatible("samsung,exynos4210")) {
+		exynos_cpuidle_pdata = pdev->dev.platform_data;
+
+		ret = cpuidle_register(&exynos_coupled_idle_driver,
+				       cpu_possible_mask);
+	} else {
+		exynos_enter_aftr = (void *)(pdev->dev.platform_data);
+
+		ret = cpuidle_register(&exynos_idle_driver, NULL);
+	}
 
-	ret = cpuidle_register(&exynos_idle_driver, NULL);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to register cpuidle driver\n");
 		return ret;
diff --git a/include/linux/platform_data/cpuidle-exynos.h b/include/linux/platform_data/cpuidle-exynos.h
new file mode 100644
index 000000000000..bfa40e4c5d5f
--- /dev/null
+++ b/include/linux/platform_data/cpuidle-exynos.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ *              http://www.samsung.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#ifndef __CPUIDLE_EXYNOS_H
+#define __CPUIDLE_EXYNOS_H
+
+struct cpuidle_exynos_data {
+	int (*cpu0_enter_aftr)(void);
+	int (*cpu1_powerdown)(void);
+	void (*pre_enter_aftr)(void);
+	void (*post_enter_aftr)(void);
+};
+
+#endif
-- 
cgit v1.2.3


From e4b3d38088df6f3acd40259c8ec32c9bd3bfe3da Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Fri, 16 Jan 2015 18:30:37 +0100
Subject: phy: exynos-video-mipi: Fix regression by adding support for PMU
 regmap

After the Exynos Power Management Unit (PMU) driver was converted
to the platform device driver in commit 14fc8b93d47323561edf5d482
("ARM: EXYNOS: Add platform driver support for Exynos PMU") and
then PMU device nodes added to Exynos4 DTs in commit
7b9613aca42a5522d269 ("ARM: dts: add PMU syscon node for exynos4")
the mipi video phy driver started failing probing, due to overlapping
memory mapped register region resources.

Now all the Exynos peripheral devices which have registers in the PMU
region are supposed to use the regmap provided by the syscon driver.
So support for regmap is added in this patch, this unfortunately
creates yet another indirection into that supposedly trivial driver.

The additional mutex is required because single register is used by
PHY pairs (they share bit in a register). An improvement here could
be to allow a PHY instance be created with a driver custom mutex,
which would then be common for each PHY pair. This would eliminate
one of 3 mutexes which need to be taken in the phy_power_on/
phy_power_off code path. However, I tried to keep this bug fix patch
possibly simple.

This change is needed to make MIPI DSI displays and MIPI CSI-2
camera sensors working again on Exynos4 boards.

Cc: Pankaj Dubey <pankaj.dubey@samsung.com>
Cc: Kukjin Kim <kgene.kim@samsung.com>
Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 .../devicetree/bindings/phy/samsung-phy.txt        |  2 +-
 drivers/phy/phy-exynos-mipi-video.c                | 89 ++++++++++++++--------
 include/linux/mfd/syscon/exynos4-pmu.h             | 21 +++++
 3 files changed, 79 insertions(+), 33 deletions(-)
 create mode 100644 include/linux/mfd/syscon/exynos4-pmu.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/phy/samsung-phy.txt b/Documentation/devicetree/bindings/phy/samsung-phy.txt
index d5bad920827f..91e38cfe1f8f 100644
--- a/Documentation/devicetree/bindings/phy/samsung-phy.txt
+++ b/Documentation/devicetree/bindings/phy/samsung-phy.txt
@@ -3,8 +3,8 @@ Samsung S5P/EXYNOS SoC series MIPI CSIS/DSIM DPHY
 
 Required properties:
 - compatible : should be "samsung,s5pv210-mipi-video-phy";
-- reg : offset and length of the MIPI DPHY register set;
 - #phy-cells : from the generic phy bindings, must be 1;
+- syscon - phandle to the PMU system controller;
 
 For "samsung,s5pv210-mipi-video-phy" compatible PHYs the second cell in
 the PHY specifier identifies the PHY and its meaning is as follows:
diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c
index 943e0f88a120..f017b2f2a54e 100644
--- a/drivers/phy/phy-exynos-mipi-video.c
+++ b/drivers/phy/phy-exynos-mipi-video.c
@@ -12,19 +12,18 @@
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/mfd/syscon/exynos4-pmu.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/phy/phy.h>
 #include <linux/platform_device.h>
+#include <linux/regmap.h>
 #include <linux/spinlock.h>
+#include <linux/mfd/syscon.h>
 
-/* MIPI_PHYn_CONTROL register offset: n = 0..1 */
+/* MIPI_PHYn_CONTROL reg. offset (for base address from ioremap): n = 0..1 */
 #define EXYNOS_MIPI_PHY_CONTROL(n)	((n) * 4)
-#define EXYNOS_MIPI_PHY_ENABLE		(1 << 0)
-#define EXYNOS_MIPI_PHY_SRESETN		(1 << 1)
-#define EXYNOS_MIPI_PHY_MRESETN		(1 << 2)
-#define EXYNOS_MIPI_PHY_RESET_MASK	(3 << 1)
 
 enum exynos_mipi_phy_id {
 	EXYNOS_MIPI_PHY_ID_CSIS0,
@@ -38,43 +37,62 @@ enum exynos_mipi_phy_id {
 	((id) == EXYNOS_MIPI_PHY_ID_DSIM0 || (id) == EXYNOS_MIPI_PHY_ID_DSIM1)
 
 struct exynos_mipi_video_phy {
-	spinlock_t slock;
 	struct video_phy_desc {
 		struct phy *phy;
 		unsigned int index;
 	} phys[EXYNOS_MIPI_PHYS_NUM];
+	spinlock_t slock;
 	void __iomem *regs;
+	struct mutex mutex;
+	struct regmap *regmap;
 };
 
 static int __set_phy_state(struct exynos_mipi_video_phy *state,
 			enum exynos_mipi_phy_id id, unsigned int on)
 {
+	const unsigned int offset = EXYNOS4_MIPI_PHY_CONTROL(id / 2);
 	void __iomem *addr;
-	u32 reg, reset;
-
-	addr = state->regs + EXYNOS_MIPI_PHY_CONTROL(id / 2);
+	u32 val, reset;
 
 	if (is_mipi_dsim_phy_id(id))
-		reset = EXYNOS_MIPI_PHY_MRESETN;
-	else
-		reset = EXYNOS_MIPI_PHY_SRESETN;
-
-	spin_lock(&state->slock);
-	reg = readl(addr);
-	if (on)
-		reg |= reset;
+		reset = EXYNOS4_MIPI_PHY_MRESETN;
 	else
-		reg &= ~reset;
-	writel(reg, addr);
-
-	/* Clear ENABLE bit only if MRESETN, SRESETN bits are not set. */
-	if (on)
-		reg |= EXYNOS_MIPI_PHY_ENABLE;
-	else if (!(reg & EXYNOS_MIPI_PHY_RESET_MASK))
-		reg &= ~EXYNOS_MIPI_PHY_ENABLE;
+		reset = EXYNOS4_MIPI_PHY_SRESETN;
+
+	if (state->regmap) {
+		mutex_lock(&state->mutex);
+		regmap_read(state->regmap, offset, &val);
+		if (on)
+			val |= reset;
+		else
+			val &= ~reset;
+		regmap_write(state->regmap, offset, val);
+		if (on)
+			val |= EXYNOS4_MIPI_PHY_ENABLE;
+		else if (!(val & EXYNOS4_MIPI_PHY_RESET_MASK))
+			val &= ~EXYNOS4_MIPI_PHY_ENABLE;
+		regmap_write(state->regmap, offset, val);
+		mutex_unlock(&state->mutex);
+	} else {
+		addr = state->regs + EXYNOS_MIPI_PHY_CONTROL(id / 2);
+
+		spin_lock(&state->slock);
+		val = readl(addr);
+		if (on)
+			val |= reset;
+		else
+			val &= ~reset;
+		writel(val, addr);
+		/* Clear ENABLE bit only if MRESETN, SRESETN bits are not set */
+		if (on)
+			val |= EXYNOS4_MIPI_PHY_ENABLE;
+		else if (!(val & EXYNOS4_MIPI_PHY_RESET_MASK))
+			val &= ~EXYNOS4_MIPI_PHY_ENABLE;
+
+		writel(val, addr);
+		spin_unlock(&state->slock);
+	}
 
-	writel(reg, addr);
-	spin_unlock(&state->slock);
 	return 0;
 }
 
@@ -118,7 +136,6 @@ static int exynos_mipi_video_phy_probe(struct platform_device *pdev)
 {
 	struct exynos_mipi_video_phy *state;
 	struct device *dev = &pdev->dev;
-	struct resource *res;
 	struct phy_provider *phy_provider;
 	unsigned int i;
 
@@ -126,14 +143,22 @@ static int exynos_mipi_video_phy_probe(struct platform_device *pdev)
 	if (!state)
 		return -ENOMEM;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	state->regmap = syscon_regmap_lookup_by_phandle(dev->of_node, "syscon");
+	if (IS_ERR(state->regmap)) {
+		struct resource *res;
 
-	state->regs = devm_ioremap_resource(dev, res);
-	if (IS_ERR(state->regs))
-		return PTR_ERR(state->regs);
+		dev_info(dev, "regmap lookup failed: %ld\n",
+			 PTR_ERR(state->regmap));
+
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+		state->regs = devm_ioremap_resource(dev, res);
+		if (IS_ERR(state->regs))
+			return PTR_ERR(state->regs);
+	}
 
 	dev_set_drvdata(dev, state);
 	spin_lock_init(&state->slock);
+	mutex_init(&state->mutex);
 
 	for (i = 0; i < EXYNOS_MIPI_PHYS_NUM; i++) {
 		struct phy *phy = devm_phy_create(dev, NULL,
diff --git a/include/linux/mfd/syscon/exynos4-pmu.h b/include/linux/mfd/syscon/exynos4-pmu.h
new file mode 100644
index 000000000000..278b1b1549e9
--- /dev/null
+++ b/include/linux/mfd/syscon/exynos4-pmu.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2015 Samsung Electronics Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_MFD_SYSCON_PMU_EXYNOS4_H_
+#define _LINUX_MFD_SYSCON_PMU_EXYNOS4_H_
+
+/* Exynos4 PMU register definitions */
+
+/* MIPI_PHYn_CONTROL register offset: n = 0..1 */
+#define EXYNOS4_MIPI_PHY_CONTROL(n)	(0x710 + (n) * 4)
+#define EXYNOS4_MIPI_PHY_ENABLE		(1 << 0)
+#define EXYNOS4_MIPI_PHY_SRESETN	(1 << 1)
+#define EXYNOS4_MIPI_PHY_MRESETN	(1 << 2)
+#define EXYNOS4_MIPI_PHY_RESET_MASK	(3 << 1)
+
+#endif /* _LINUX_MFD_SYSCON_PMU_EXYNOS4_H_ */
-- 
cgit v1.2.3


From 074f9dd55f9cab1b82690ed7e44bcf38b9616ce0 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 29 Jan 2015 15:05:04 -0500
Subject: USB: add flag for HCDs that can't receive wakeup requests
 (isp1760-hcd)

Currently the USB stack assumes that all host controller drivers are
capable of receiving wakeup requests from downstream devices.
However, this isn't true for the isp1760-hcd driver, which means that
it isn't safe to do a runtime suspend of any device attached to a
root-hub port if the device requires wakeup.

This patch adds a "cant_recv_wakeups" flag to the usb_hcd structure
and sets the flag in isp1760-hcd.  The core is modified to prevent a
direct child of the root hub from being put into runtime suspend with
wakeup enabled if the flag is set.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Nicolas Pitre <nico@linaro.org>
CC: <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <greg@kroah.com>
---
 drivers/usb/core/driver.c      | 12 ++++++++++++
 drivers/usb/host/isp1760-hcd.c |  3 +++
 include/linux/usb/hcd.h        |  2 ++
 3 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index c76ec9758ce3..818369afff63 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -1780,6 +1780,18 @@ static int autosuspend_check(struct usb_device *udev)
 		dev_dbg(&udev->dev, "remote wakeup needed for autosuspend\n");
 		return -EOPNOTSUPP;
 	}
+
+	/*
+	 * If the device is a direct child of the root hub and the HCD
+	 * doesn't handle wakeup requests, don't allow autosuspend when
+	 * wakeup is needed.
+	 */
+	if (w && udev->parent == udev->bus->root_hub &&
+			bus_to_hcd(udev->bus)->cant_recv_wakeups) {
+		dev_dbg(&udev->dev, "HCD doesn't handle wakeup requests\n");
+		return -EOPNOTSUPP;
+	}
+
 	udev->do_remote_wakeup = w;
 	return 0;
 }
diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c
index dbba455884f4..cecf39a220e7 100644
--- a/drivers/usb/host/isp1760-hcd.c
+++ b/drivers/usb/host/isp1760-hcd.c
@@ -2245,6 +2245,9 @@ struct usb_hcd *isp1760_register(phys_addr_t res_start, resource_size_t res_len,
 	hcd->rsrc_start = res_start;
 	hcd->rsrc_len = res_len;
 
+	/* This driver doesn't support wakeup requests */
+	hcd->cant_recv_wakeups = 1;
+
 	ret = usb_add_hcd(hcd, irq, irqflags);
 	if (ret)
 		goto err_unmap;
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 8968f616e414..68b1e836dff1 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -146,6 +146,8 @@ struct usb_hcd {
 	unsigned		amd_resume_bug:1; /* AMD remote wakeup quirk */
 	unsigned		can_do_streams:1; /* HC supports streams */
 	unsigned		tpl_support:1; /* OTG & EH TPL support */
+	unsigned		cant_recv_wakeups:1;
+			/* wakeup requests from downstream aren't received */
 
 	unsigned int		irq;		/* irq allocated */
 	void __iomem		*regs;		/* device memory/io */
-- 
cgit v1.2.3


From 15d0f5ea348b9c4e6d41df294dde38a56a39c7bf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 2 Feb 2015 10:07:59 -0700
Subject: Make super_blocks and sb_lock static

The only user outside of fs/super.c is gone now

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/super.c         | 4 ++--
 include/linux/fs.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 3b4dadafdd60..05a021638b11 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_writers",
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65d02de342e1..2f717baefdf8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1183,8 +1183,6 @@ struct mm_struct;
 #define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
 #define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
-extern struct list_head super_blocks;
-extern spinlock_t sb_lock;
 
 /* Possible states of 'frozen' field */
 enum {
-- 
cgit v1.2.3


From 6cae0a4648c0db2a74efb816cd2ce84390c90480 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 16 Aug 2014 13:31:51 +0200
Subject: nfs: add LAYOUT_TYPE_MAX enum value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This gives us a nice upper bound for later use in nfѕd.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nfs4.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 022b761dbf0a..8a3589c2542c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -516,6 +516,7 @@ enum pnfs_layouttype {
 	LAYOUT_NFSV4_1_FILES  = 1,
 	LAYOUT_OSD2_OBJECTS = 2,
 	LAYOUT_BLOCK_VOLUME = 3,
+	LAYOUT_TYPE_MAX
 };
 
 /* used for both layout return and recall */
-- 
cgit v1.2.3


From 11afe9f76e121e960445deee5b7f26f0787a1990 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 21 Jan 2015 19:17:03 +0100
Subject: fs: add FL_LAYOUT lease type

This (ab-)uses the file locking code to allow filesystems to recall
outstanding pNFS layouts on a file.  This new lease type is similar but
not quite the same as FL_DELEG.  A FL_LAYOUT lease can always be granted,
an a per-filesystem lock (XFS iolock for the initial implementation)
ensures not FL_LAYOUT leases granted when we would need to recall them.

Also included are changes that allow multiple outstanding read
leases of different types on the same file as long as they have a
differnt owner.  This wasn't a problem until now as nfsd never set
FL_LEASE leases, and no one else used FL_DELEG leases, but given that
nfsd will also issues FL_LAYOUT leases we will have to handle it now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/locks.c         | 14 ++++++++++----
 include/linux/fs.h | 16 ++++++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 22ac7694cc84..4753218f308e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
 
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
 #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
 
 static bool lease_breaking(struct file_lock *fl)
@@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 
 static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 {
+	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+		return false;
 	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
 		return false;
 	return locks_conflict(breaker, lease);
@@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp)
  * conflict with the lease we're trying to set.
  */
 static int
-check_conflicting_open(const struct dentry *dentry, const long arg)
+check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
 {
 	int ret = 0;
 	struct inode *inode = dentry->d_inode;
 
+	if (flags & FL_LAYOUT)
+		return 0;
+
 	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 		return -EAGAIN;
 
@@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
-	error = check_conflicting_open(dentry, arg);
+	error = check_conflicting_open(dentry, arg, lease->fl_flags);
 	if (error)
 		goto out;
 
@@ -1703,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	 * precedes these checks.
 	 */
 	smp_mb();
-	error = check_conflicting_open(dentry, arg);
+	error = check_conflicting_open(dentry, arg, lease->fl_flags);
 	if (error) {
 		locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt);
 		goto out;
@@ -1787,6 +1792,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 			WARN_ON_ONCE(1);
 			return -ENOLCK;
 		}
+
 		return generic_add_lease(filp, arg, flp, priv);
 	default:
 		return -EINVAL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ddd2fa7cefd3..84740145f835 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -875,6 +875,7 @@ static inline struct file *get_file(struct file *f)
 #define FL_DOWNGRADE_PENDING	256 /* Lease is being downgraded */
 #define FL_UNLOCK_PENDING	512 /* Lease is being broken */
 #define FL_OFDLCK	1024	/* lock is "owned" by struct file */
+#define FL_LAYOUT	2048	/* outstanding pNFS layout */
 
 /*
  * Special return value from posix_lock_file() and vfs_lock_file() for
@@ -2037,6 +2038,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 	return ret;
 }
 
+static inline int break_layout(struct inode *inode, bool wait)
+{
+	smp_mb();
+	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+		return __break_lease(inode,
+				wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
+				FL_LAYOUT);
+	return 0;
+}
+
 #else /* !CONFIG_FILE_LOCKING */
 static inline int locks_mandatory_locked(struct file *file)
 {
@@ -2092,6 +2103,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 	return 0;
 }
 
+static inline int break_layout(struct inode *inode, bool wait)
+{
+	return 0;
+}
+
 #endif /* CONFIG_FILE_LOCKING */
 
 /* fs/open.c */
-- 
cgit v1.2.3


From 9cf514ccfacb301f3b1b4509a8ce25dffad55880 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 5 May 2014 13:11:59 +0200
Subject: nfsd: implement pNFS operations

Add support for the GETDEVICEINFO, LAYOUTGET, LAYOUTCOMMIT and
LAYOUTRETURN NFSv4.1 operations, as well as backing code to manage
outstanding layouts and devices.

Layout management is very straight forward, with a nfs4_layout_stateid
structure that extends nfs4_stid to manage layout stateids as the
top-level structure.  It is linked into the nfs4_file and nfs4_client
structures like the other stateids, and contains a linked list of
layouts that hang of the stateid.  The actual layout operations are
implemented in layout drivers that are not part of this commit, but
will be added later.

The worst part of this commit is the management of the pNFS device IDs,
which suffers from a specification that is not sanely implementable due
to the fact that the device-IDs are global and not bound to an export,
and have a small enough size so that we can't store the fsid portion of
a file handle, and must never be reused.  As we still do need perform all
export authentication and validation checks on a device ID passed to
GETDEVICEINFO we are caught between a rock and a hard place.  To work
around this issue we add a new hash that maps from a 64-bit integer to a
fsid so that we can look up the export to authenticate against it,
a 32-bit integer as a generation that we can bump when changing the device,
and a currently unused 32-bit integer that could be used in the future
to handle more than a single device per export.  Entries in this hash
table are never deleted as we can't reuse the ids anyway, and would have
a severe lifetime problem anyway as Linux export structures are temporary
structures that can go away under load.

Parts of the XDR data, structures and marshaling/unmarshaling code, as
well as many concepts are derived from the old pNFS server implementation
from Andy Adamson, Benny Halevy, Dean Hildebrand, Marc Eshel, Fred Isaman,
Mike Sager, Ricardo Labiaga and many others.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfsd/Kconfig                  |  10 +
 fs/nfsd/Makefile                 |   1 +
 fs/nfsd/export.c                 |   8 +
 fs/nfsd/export.h                 |   2 +
 fs/nfsd/nfs4layouts.c            | 487 +++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4proc.c               | 302 ++++++++++++++++++++++++
 fs/nfsd/nfs4state.c              |  16 +-
 fs/nfsd/nfs4xdr.c                | 312 +++++++++++++++++++++++++
 fs/nfsd/nfsctl.c                 |   9 +-
 fs/nfsd/nfsd.h                   |  16 +-
 fs/nfsd/pnfs.h                   |  80 +++++++
 fs/nfsd/state.h                  |  21 ++
 fs/nfsd/xdr4.h                   |  59 +++++
 include/linux/nfs4.h             |   1 +
 include/uapi/linux/nfsd/debug.h  |   1 +
 include/uapi/linux/nfsd/export.h |   4 +-
 16 files changed, 1324 insertions(+), 5 deletions(-)
 create mode 100644 fs/nfsd/nfs4layouts.c
 create mode 100644 fs/nfsd/pnfs.h

(limited to 'include/linux')

diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
 
 	  If unsure, say N.
 
+config NFSD_PNFS
+	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+	depends on NFSD_V4
+	help
+	  This option enables support for the parallel NFS features of the
+	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+	  server.
+
+	  If unsure, say N.
+
 config NFSD_V4_SECURITY_LABEL
 	bool "Provide Security Label support for NFSv4 server"
 	depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..5806270a8567 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,3 +12,4 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
 #include "nfsd.h"
 #include "nfsfh.h"
 #include "netns.h"
+#include "pnfs.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_client = dom;
 	exp.cd = cd;
+	exp.ex_devid_map = NULL;
 
 	/* expiry */
 	err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (!gid_valid(exp.ex_anon_gid))
 			goto out4;
 		err = 0;
+
+		nfsd4_setup_layout_type(&exp);
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
+	new->ex_layout_type = 0;
 	new->ex_uuid = NULL;
 	new->cd = item->cd;
 }
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_devid_map = item->ex_devid_map;
+	item->ex_devid_map = NULL;
 	new->ex_uuid = item->ex_uuid;
 	item->ex_uuid = NULL;
 	new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_layout_type = item->ex_layout_type;
 	new->ex_nflavors = item->ex_nflavors;
 	for (i = 0; i < MAX_SECINFO_LIST; i++) {
 		new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
 	struct nfsd4_fs_locations ex_fslocs;
 	uint32_t		ex_nflavors;
 	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
+	enum pnfs_layouttype	ex_layout_type;
+	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
 };
 
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..8273270418b1
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/jhash.h>
+#include <linux/sched.h>
+
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDDBG_FACILITY                NFSDDBG_PNFS
+
+struct nfs4_layout {
+	struct list_head		lo_perstate;
+	struct nfs4_layout_stateid	*lo_state;
+	struct nfsd4_layout_seg		lo_seg;
+};
+
+static struct kmem_cache *nfs4_layout_cache;
+static struct kmem_cache *nfs4_layout_stateid_cache;
+
+const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+};
+
+/* pNFS device ID to export fsid mapping */
+#define DEVID_HASH_BITS	8
+#define DEVID_HASH_SIZE	(1 << DEVID_HASH_BITS)
+#define DEVID_HASH_MASK	(DEVID_HASH_SIZE - 1)
+static u64 nfsd_devid_seq = 1;
+static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfsd_devid_lock);
+
+static inline u32 devid_hashfn(u64 idx)
+{
+	return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
+}
+
+static void
+nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+{
+	const struct knfsd_fh *fh = &fhp->fh_handle;
+	size_t fsid_len = key_len(fh->fh_fsid_type);
+	struct nfsd4_deviceid_map *map, *old;
+	int i;
+
+	map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+	if (!map)
+		return;
+
+	map->fsid_type = fh->fh_fsid_type;
+	memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+
+	spin_lock(&nfsd_devid_lock);
+	if (fhp->fh_export->ex_devid_map)
+		goto out_unlock;
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++) {
+		list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+			if (old->fsid_type != fh->fh_fsid_type)
+				continue;
+			if (memcmp(old->fsid, fh->fh_fsid,
+					key_len(old->fsid_type)))
+				continue;
+
+			fhp->fh_export->ex_devid_map = old;
+			goto out_unlock;
+		}
+	}
+
+	map->idx = nfsd_devid_seq++;
+	list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+	fhp->fh_export->ex_devid_map = map;
+	map = NULL;
+
+out_unlock:
+	spin_unlock(&nfsd_devid_lock);
+	kfree(map);
+}
+
+struct nfsd4_deviceid_map *
+nfsd4_find_devid_map(int idx)
+{
+	struct nfsd4_deviceid_map *map, *ret = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+		if (map->idx == idx)
+			ret = map;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int
+nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation)
+{
+	if (!fhp->fh_export->ex_devid_map) {
+		nfsd4_alloc_devid_map(fhp);
+		if (!fhp->fh_export->ex_devid_map)
+			return -ENOMEM;
+	}
+
+	id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
+	id->generation = device_generation;
+	id->pad = 0;
+	return 0;
+}
+
+void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+	if (exp->ex_flags & NFSEXP_NOPNFS)
+		return;
+}
+
+static void
+nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+{
+	struct nfs4_layout_stateid *ls = layoutstateid(stid);
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+
+	spin_lock(&clp->cl_lock);
+	list_del_init(&ls->ls_perclnt);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_del_init(&ls->ls_perfile);
+	spin_unlock(&fp->fi_lock);
+
+	kmem_cache_free(nfs4_layout_stateid_cache, ls);
+}
+
+static struct nfs4_layout_stateid *
+nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+		struct nfs4_stid *parent, u32 layout_type)
+{
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_file *fp = parent->sc_file;
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stp;
+
+	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+	if (!stp)
+		return NULL;
+	stp->sc_free = nfsd4_free_layout_stateid;
+	get_nfs4_file(fp);
+	stp->sc_file = fp;
+
+	ls = layoutstateid(stp);
+	INIT_LIST_HEAD(&ls->ls_perclnt);
+	INIT_LIST_HEAD(&ls->ls_perfile);
+	spin_lock_init(&ls->ls_lock);
+	INIT_LIST_HEAD(&ls->ls_layouts);
+	ls->ls_layout_type = layout_type;
+
+	spin_lock(&clp->cl_lock);
+	stp->sc_type = NFS4_LAYOUT_STID;
+	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_add(&ls->ls_perfile, &fp->fi_lo_states);
+	spin_unlock(&fp->fi_lock);
+
+	return ls;
+}
+
+__be32
+nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stid;
+	unsigned char typemask = NFS4_LAYOUT_STID;
+	__be32 status;
+
+	if (create)
+		typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+
+	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+			net_generic(SVC_NET(rqstp), nfsd_net_id));
+	if (status)
+		goto out;
+
+	if (!fh_match(&cstate->current_fh.fh_handle,
+		      &stid->sc_file->fi_fhandle)) {
+		status = nfserr_bad_stateid;
+		goto out_put_stid;
+	}
+
+	if (stid->sc_type != NFS4_LAYOUT_STID) {
+		ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+		nfs4_put_stid(stid);
+
+		status = nfserr_jukebox;
+		if (!ls)
+			goto out;
+	} else {
+		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+
+		status = nfserr_bad_stateid;
+		if (stateid->si_generation > stid->sc_stateid.si_generation)
+			goto out_put_stid;
+		if (layout_type != ls->ls_layout_type)
+			goto out_put_stid;
+	}
+
+	*lsp = ls;
+	return 0;
+
+out_put_stid:
+	nfs4_put_stid(stid);
+out:
+	return status;
+}
+
+static inline u64
+layout_end(struct nfsd4_layout_seg *seg)
+{
+	u64 end = seg->offset + seg->length;
+	return end >= seg->offset ? end : NFS4_MAX_UINT64;
+}
+
+static void
+layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+{
+	if (end == NFS4_MAX_UINT64)
+		lo->length = NFS4_MAX_UINT64;
+	else
+		lo->length = end - lo->offset;
+}
+
+static bool
+layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+{
+	if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
+		return false;
+	if (layout_end(&lo->lo_seg) <= s->offset)
+		return false;
+	if (layout_end(s) <= lo->lo_seg.offset)
+		return false;
+	return true;
+}
+
+static bool
+layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+{
+	if (lo->iomode != new->iomode)
+		return false;
+	if (layout_end(new) < lo->offset)
+		return false;
+	if (layout_end(lo) < new->offset)
+		return false;
+
+	lo->offset = min(lo->offset, new->offset);
+	layout_update_len(lo, max(layout_end(lo), layout_end(new)));
+	return true;
+}
+
+__be32
+nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+{
+	struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+	struct nfs4_layout *lp, *new = NULL;
+
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+		if (layouts_try_merge(&lp->lo_seg, seg))
+			goto done;
+	}
+	spin_unlock(&ls->ls_lock);
+
+	new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+	if (!new)
+		return nfserr_jukebox;
+	memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+	new->lo_state = ls;
+
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+		if (layouts_try_merge(&lp->lo_seg, seg))
+			goto done;
+	}
+
+	atomic_inc(&ls->ls_stid.sc_count);
+	list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+	new = NULL;
+done:
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	spin_unlock(&ls->ls_lock);
+	if (new)
+		kmem_cache_free(nfs4_layout_cache, new);
+	return nfs_ok;
+}
+
+static void
+nfsd4_free_layouts(struct list_head *reaplist)
+{
+	while (!list_empty(reaplist)) {
+		struct nfs4_layout *lp = list_first_entry(reaplist,
+				struct nfs4_layout, lo_perstate);
+
+		list_del(&lp->lo_perstate);
+		nfs4_put_stid(&lp->lo_state->ls_stid);
+		kmem_cache_free(nfs4_layout_cache, lp);
+	}
+}
+
+static void
+nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+		struct list_head *reaplist)
+{
+	struct nfsd4_layout_seg *lo = &lp->lo_seg;
+	u64 end = layout_end(lo);
+
+	if (seg->offset <= lo->offset) {
+		if (layout_end(seg) >= end) {
+			list_move_tail(&lp->lo_perstate, reaplist);
+			return;
+		}
+		end = seg->offset;
+	} else {
+		/* retain the whole layout segment on a split. */
+		if (layout_end(seg) < end) {
+			dprintk("%s: split not supported\n", __func__);
+			return;
+		}
+
+		lo->offset = layout_end(seg);
+	}
+
+	layout_update_len(lo, end);
+}
+
+__be32
+nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_layout *lp, *n;
+	LIST_HEAD(reaplist);
+	__be32 nfserr;
+	int found = 0;
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+						false, lrp->lr_layout_type,
+						&ls);
+	if (nfserr)
+		return nfserr;
+
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+		if (layouts_overlapping(lp, &lrp->lr_seg)) {
+			nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+			found++;
+		}
+	}
+	if (!list_empty(&ls->ls_layouts)) {
+		if (found) {
+			update_stateid(&ls->ls_stid.sc_stateid);
+			memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
+				sizeof(stateid_t));
+		}
+		lrp->lrs_present = 1;
+	} else {
+		nfs4_unhash_stid(&ls->ls_stid);
+		lrp->lrs_present = 0;
+	}
+	spin_unlock(&ls->ls_lock);
+
+	nfs4_put_stid(&ls->ls_stid);
+	nfsd4_free_layouts(&reaplist);
+	return nfs_ok;
+}
+
+__be32
+nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_layout *lp, *t;
+	LIST_HEAD(reaplist);
+
+	lrp->lrs_present = 0;
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+		if (lrp->lr_return_type == RETURN_FSID &&
+		    !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+				   &cstate->current_fh.fh_handle))
+			continue;
+
+		spin_lock(&ls->ls_lock);
+		list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+			if (lrp->lr_seg.iomode == IOMODE_ANY ||
+			    lrp->lr_seg.iomode == lp->lo_seg.iomode)
+				list_move_tail(&lp->lo_perstate, &reaplist);
+		}
+		spin_unlock(&ls->ls_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	nfsd4_free_layouts(&reaplist);
+	return 0;
+}
+
+static void
+nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+		struct list_head *reaplist)
+{
+	spin_lock(&ls->ls_lock);
+	list_splice_init(&ls->ls_layouts, reaplist);
+	spin_unlock(&ls->ls_lock);
+}
+
+void
+nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
+		nfsd4_return_all_layouts(ls, &reaplist);
+	spin_unlock(&clp->cl_lock);
+
+	nfsd4_free_layouts(&reaplist);
+}
+
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&fp->fi_lock);
+	list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
+		if (ls->ls_stid.sc_client == clp)
+			nfsd4_return_all_layouts(ls, &reaplist);
+	}
+	spin_unlock(&fp->fi_lock);
+
+	nfsd4_free_layouts(&reaplist);
+}
+
+int
+nfsd4_init_pnfs(void)
+{
+	int i;
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nfsd_devid_hash[i]);
+
+	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+			sizeof(struct nfs4_layout), 0, 0, NULL);
+	if (!nfs4_layout_cache)
+		return -ENOMEM;
+
+	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+	if (!nfs4_layout_stateid_cache) {
+		kmem_cache_destroy(nfs4_layout_cache);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void
+nfsd4_exit_pnfs(void)
+{
+	int i;
+
+	kmem_cache_destroy(nfs4_layout_cache);
+	kmem_cache_destroy(nfs4_layout_stateid_cache);
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++) {
+		struct nfsd4_deviceid_map *map, *n;
+
+		list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+			kfree(map);
+	}
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..2b91443497cc 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,7 @@
 #include "current_stateid.h"
 #include "netns.h"
 #include "acl.h"
+#include "pnfs.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -1178,6 +1179,252 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status == nfserr_same ? nfs_ok : status;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+	if (!exp->ex_layout_type) {
+		dprintk("%s: export does not support pNFS\n", __func__);
+		return NULL;
+	}
+
+	if (exp->ex_layout_type != layout_type) {
+		dprintk("%s: layout type %d not supported\n",
+			__func__, layout_type);
+		return NULL;
+	}
+
+	return nfsd4_layout_ops[layout_type];
+}
+
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	const struct nfsd4_layout_ops *ops;
+	struct nfsd4_deviceid_map *map;
+	struct svc_export *exp;
+	__be32 nfserr;
+
+	dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+	       __func__,
+	       gdp->gd_layout_type,
+	       gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+	       gdp->gd_maxcount);
+
+	map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+	if (!map) {
+		dprintk("%s: couldn't find device ID to export mapping!\n",
+			__func__);
+		return nfserr_noent;
+	}
+
+	exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+	if (IS_ERR(exp)) {
+		dprintk("%s: could not find device id\n", __func__);
+		return nfserr_noent;
+	}
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+	if (!ops)
+		goto out;
+
+	nfserr = nfs_ok;
+	if (gdp->gd_maxcount != 0)
+		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+
+	gdp->gd_notify_types &= ops->notify_types;
+	exp_put(exp);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutget *lgp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	const struct nfsd4_layout_ops *ops;
+	struct nfs4_layout_stateid *ls;
+	__be32 nfserr;
+	int accmode;
+
+	switch (lgp->lg_seg.iomode) {
+	case IOMODE_READ:
+		accmode = NFSD_MAY_READ;
+		break;
+	case IOMODE_RW:
+		accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+		break;
+	default:
+		dprintk("%s: invalid iomode %d\n",
+			__func__, lgp->lg_seg.iomode);
+		nfserr = nfserr_badiomode;
+		goto out;
+	}
+
+	nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+	if (!ops)
+		goto out;
+
+	/*
+	 * Verify minlength and range as per RFC5661:
+	 *  o  If loga_length is less than loga_minlength,
+	 *     the metadata server MUST return NFS4ERR_INVAL.
+	 *  o  If the sum of loga_offset and loga_minlength exceeds
+	 *     NFS4_UINT64_MAX, and loga_minlength is not
+	 *     NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+	 *  o  If the sum of loga_offset and loga_length exceeds
+	 *     NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+	 *     the error NFS4ERR_INVAL MUST result.
+	 */
+	nfserr = nfserr_inval;
+	if (lgp->lg_seg.length < lgp->lg_minlength ||
+	    (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+	     lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+	    (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+	     lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+		goto out;
+	if (lgp->lg_seg.length == 0)
+		goto out;
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+						true, lgp->lg_layout_type, &ls);
+	if (nfserr)
+		goto out;
+
+	nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
+				     current_fh, lgp);
+	if (nfserr)
+		goto out_put_stid;
+
+	nfserr = nfsd4_insert_layout(lgp, ls);
+
+out_put_stid:
+	nfs4_put_stid(&ls->ls_stid);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutcommit *lcp)
+{
+	const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+	struct svc_fh *current_fh = &cstate->current_fh;
+	const struct nfsd4_layout_ops *ops;
+	loff_t new_size = lcp->lc_last_wr + 1;
+	struct inode *inode;
+	struct nfs4_layout_stateid *ls;
+	__be32 nfserr;
+
+	nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+	if (!ops)
+		goto out;
+	inode = current_fh->fh_dentry->d_inode;
+
+	nfserr = nfserr_inval;
+	if (new_size <= seg->offset) {
+		dprintk("pnfsd: last write before layout segment\n");
+		goto out;
+	}
+	if (new_size > seg->offset + seg->length) {
+		dprintk("pnfsd: last write beyond layout segment\n");
+		goto out;
+	}
+	if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+		dprintk("pnfsd: layoutcommit beyond EOF\n");
+		goto out;
+	}
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+						false, lcp->lc_layout_type,
+						&ls);
+	if (nfserr) {
+		/* fixup error code as per RFC5661 */
+		if (nfserr == nfserr_bad_stateid)
+			nfserr = nfserr_badlayout;
+		goto out;
+	}
+
+	nfserr = ops->proc_layoutcommit(inode, lcp);
+	if (nfserr)
+		goto out_put_stid;
+
+	if (new_size > i_size_read(inode)) {
+		lcp->lc_size_chg = 1;
+		lcp->lc_newsize = new_size;
+	} else {
+		lcp->lc_size_chg = 0;
+	}
+
+out_put_stid:
+	nfs4_put_stid(&ls->ls_stid);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	__be32 nfserr;
+
+	nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+		goto out;
+
+	switch (lrp->lr_seg.iomode) {
+	case IOMODE_READ:
+	case IOMODE_RW:
+	case IOMODE_ANY:
+		break;
+	default:
+		dprintk("%s: invalid iomode %d\n", __func__,
+			lrp->lr_seg.iomode);
+		nfserr = nfserr_inval;
+		goto out;
+	}
+
+	switch (lrp->lr_return_type) {
+	case RETURN_FILE:
+		nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+		break;
+	case RETURN_FSID:
+	case RETURN_ALL:
+		nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+		break;
+	default:
+		dprintk("%s: invalid return_type %d\n", __func__,
+			lrp->lr_return_type);
+		nfserr = nfserr_inval;
+		break;
+	}
+out:
+	return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 /*
  * NULL call.
  */
@@ -1679,6 +1926,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
 		op_encode_channel_attrs_maxsz) * sizeof(__be32);
 }
 
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * At this stage we don't really know what layout driver will handle the request,
+ * so we need to define an arbitrary upper bound here.
+ */
+#define MAX_LAYOUT_SIZE		128
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* logr_return_on_close */ +
+		op_encode_stateid_maxsz +
+		1 /* nr of layouts */ +
+		MAX_LAYOUT_SIZE) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* locr_newsize */ +
+		2 /* ns_size */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* lrs_stateid */ +
+		op_encode_stateid_maxsz) * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2243,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
+		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_name = "OP_GETDEVICEINFO",
+	},
+	[OP_LAYOUTGET] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutget,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTGET",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
+	},
+	[OP_LAYOUTCOMMIT] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutcommit,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTCOMMIT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
+	},
+	[OP_LAYOUTRETURN] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutreturn,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTRETURN",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
+	},
+#endif /* CONFIG_NFSD_PNFS */
 
 	/* NFSv4.2 operations */
 	[OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index eefd29ec43f2..c89f79dc69e2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
 #include "current_stateid.h"
 
 #include "netns.h"
+#include "pnfs.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -1539,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	INIT_LIST_HEAD(&clp->cl_lru);
 	INIT_LIST_HEAD(&clp->cl_callbacks);
 	INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
 	spin_lock_init(&clp->cl_lock);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
@@ -1643,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
 		nfs4_get_stateowner(&oo->oo_owner);
 		release_openowner(oo);
 	}
+	nfsd4_return_all_client_layouts(clp);
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2126,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 static void
 nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
 {
-	/* pNFS is not supported */
+#ifdef CONFIG_NFSD_PNFS
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
 	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
 
 	/* Referrals are supported, Migration is not. */
 	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3055,6 +3063,9 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
 	fp->fi_share_deny = 0;
 	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&fp->fi_lo_states);
+#endif
 	hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
 }
 
@@ -4841,6 +4852,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
+	nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
+				      stp->st_stid.sc_file);
+
 	nfsd4_close_open_stateid(stp);
 
 	/* put reference from nfs4_preprocess_seqid_op */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 974533e5a427..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
 #include "state.h"
 #include "cache.h"
 #include "netns.h"
+#include "pnfs.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -1522,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
 	DECODE_TAIL;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	DECODE_HEAD;
+	u32 num, i;
+
+	READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+	COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+	gdev->gd_layout_type = be32_to_cpup(p++);
+	gdev->gd_maxcount = be32_to_cpup(p++);
+	num = be32_to_cpup(p++);
+	if (num) {
+		READ_BUF(4 * num);
+		gdev->gd_notify_types = be32_to_cpup(p++);
+		for (i = 1; i < num; i++) {
+			if (be32_to_cpup(p++)) {
+				status = nfserr_inval;
+				goto out;
+			}
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutget *lgp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(36);
+	lgp->lg_signal = be32_to_cpup(p++);
+	lgp->lg_layout_type = be32_to_cpup(p++);
+	lgp->lg_seg.iomode = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+	p = xdr_decode_hyper(p, &lgp->lg_minlength);
+	nfsd4_decode_stateid(argp, &lgp->lg_sid);
+	READ_BUF(4);
+	lgp->lg_maxcount = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutcommit *lcp)
+{
+	DECODE_HEAD;
+	u32 timechange;
+
+	READ_BUF(20);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+	lcp->lc_reclaim = be32_to_cpup(p++);
+	nfsd4_decode_stateid(argp, &lcp->lc_sid);
+	READ_BUF(4);
+	lcp->lc_newoffset = be32_to_cpup(p++);
+	if (lcp->lc_newoffset) {
+		READ_BUF(8);
+		p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+	} else
+		lcp->lc_last_wr = 0;
+	READ_BUF(4);
+	timechange = be32_to_cpup(p++);
+	if (timechange) {
+		status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+		if (status)
+			return status;
+	} else {
+		lcp->lc_mtime.tv_nsec = UTIME_NOW;
+	}
+	READ_BUF(8);
+	lcp->lc_layout_type = be32_to_cpup(p++);
+
+	/*
+	 * Save the layout update in XDR format and let the layout driver deal
+	 * with it later.
+	 */
+	lcp->lc_up_len = be32_to_cpup(p++);
+	if (lcp->lc_up_len > 0) {
+		READ_BUF(lcp->lc_up_len);
+		READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+	}
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutreturn *lrp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(16);
+	lrp->lr_reclaim = be32_to_cpup(p++);
+	lrp->lr_layout_type = be32_to_cpup(p++);
+	lrp->lr_seg.iomode = be32_to_cpup(p++);
+	lrp->lr_return_type = be32_to_cpup(p++);
+	if (lrp->lr_return_type == RETURN_FILE) {
+		READ_BUF(16);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+		nfsd4_decode_stateid(argp, &lrp->lr_sid);
+		READ_BUF(4);
+		lrp->lrf_body_len = be32_to_cpup(p++);
+		if (lrp->lrf_body_len > 0) {
+			READ_BUF(lrp->lrf_body_len);
+			READMEM(lrp->lrf_body, lrp->lrf_body_len);
+		}
+	} else {
+		lrp->lr_seg.offset = 0;
+		lrp->lr_seg.length = NFS4_MAX_UINT64;
+	}
+
+	DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 static __be32
 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
 		       struct nfsd4_fallocate *fallocate)
@@ -1616,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
 	[OP_FREE_STATEID]	= (nfsd4_dec)nfsd4_decode_free_stateid,
 	[OP_GET_DIR_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_layoutcommit,
+	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_layoutget,
+	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
 	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
 	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_secinfo_no_name,
 	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
 	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2548,6 +2678,30 @@ out_acl:
 			get_parent_attributes(exp, &stat);
 		p = xdr_encode_hyper(p, stat.ino);
 	}
+#ifdef CONFIG_NFSD_PNFS
+	if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
+	    (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
+		if (exp->ex_layout_type) {
+			p = xdr_reserve_space(xdr, 8);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(1);
+			*p++ = cpu_to_be32(exp->ex_layout_type);
+		} else {
+			p = xdr_reserve_space(xdr, 4);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(0);
+		}
+	}
+
+	if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(stat.blksize);
+	}
+#endif /* CONFIG_NFSD_PNFS */
 	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
 		status = nfsd4_encode_security_label(xdr, rqstp, context,
 								contextlen);
@@ -3824,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfserr;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops =
+		nfsd4_layout_ops[gdev->gd_layout_type];
+	u32 starting_len = xdr->buf->len, needed_len;
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, nfserr);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(gdev->gd_layout_type);
+
+	/* If maxcount is 0 then just update notifications */
+	if (gdev->gd_maxcount != 0) {
+		nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+		if (nfserr) {
+			/*
+			 * We don't bother to burden the layout drivers with
+			 * enforcing gd_maxcount, just tell the client to
+			 * come back with a bigger buffer if it's not enough.
+			 */
+			if (xdr->buf->len + 4 > gdev->gd_maxcount)
+				goto toosmall;
+			goto out;
+		}
+	}
+
+	nfserr = nfserr_resource;
+	if (gdev->gd_notify_types) {
+		p = xdr_reserve_space(xdr, 4 + 4);
+		if (!p)
+			goto out;
+		*p++ = cpu_to_be32(1);			/* bitmap length */
+		*p++ = cpu_to_be32(gdev->gd_notify_types);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out;
+		*p++ = 0;
+	}
+
+	nfserr = 0;
+out:
+	kfree(gdev->gd_device);
+	dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
+	return nfserr;
+
+toosmall:
+	dprintk("%s: maxcount too small\n", __func__);
+	needed_len = xdr->buf->len + 4 /* notifications */;
+	xdr_truncate_encode(xdr, starting_len);
+	p = xdr_reserve_space(xdr, 4);
+	if (!p) {
+		nfserr = nfserr_resource;
+	} else {
+		*p++ = cpu_to_be32(needed_len);
+		nfserr = nfserr_toosmall;
+	}
+	goto out;
+}
+
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops =
+		nfsd4_layout_ops[lgp->lg_layout_type];
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, nfserr);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(1);	/* we always set return-on-close */
+	*p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+	p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+				    sizeof(stateid_opaque_t));
+
+	*p++ = cpu_to_be32(1);	/* we always return a single layout */
+	p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+	p = xdr_encode_hyper(p, lgp->lg_seg.length);
+	*p++ = cpu_to_be32(lgp->lg_seg.iomode);
+	*p++ = cpu_to_be32(lgp->lg_layout_type);
+
+	nfserr = ops->encode_layoutget(xdr, lgp);
+out:
+	kfree(lgp->lg_content);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+			  struct nfsd4_layoutcommit *lcp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lcp->lc_size_chg);
+	if (lcp->lc_size_chg) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_hyper(p, lcp->lc_newsize);
+	}
+
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lrp->lrs_present);
+	if (lrp->lrs_present)
+		nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+	return nfs_ok;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  struct nfsd4_seek *seek)
@@ -3900,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_layoutcommit,
+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_layoutget,
+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
 	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+#endif
 	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo_no_name,
 	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
 	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
 #include "cache.h"
 #include "state.h"
 #include "netns.h"
+#include "pnfs.h"
 
 /*
  *	We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
 	retval = nfsd4_init_slabs();
 	if (retval)
 		goto out_unregister_pernet;
-	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
+	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	if (retval)
+		goto out_exit_pnfs;
 	nfsd_stat_init();	/* Statistics */
 	retval = nfsd_reply_cache_init();
 	if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
 out_free_stat:
 	nfsd_stat_shutdown();
 	nfsd_fault_inject_cleanup();
+out_exit_pnfs:
+	nfsd4_exit_pnfs();
 out_free_slabs:
 	nfsd4_free_slabs();
 out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
 	nfsd_stat_shutdown();
 	nfsd_lockd_shutdown();
 	nfsd4_free_slabs();
+	nfsd4_exit_pnfs();
 	nfsd_fault_inject_cleanup();
 	unregister_filesystem(&nfsd_fs_type);
 	unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void		nfsd_lockd_shutdown(void);
 
 #define NFSD4_SUPPORTED_ATTRS_WORD2 0
 
+/* 4.1 */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1	FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE	| FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1	0
+#define PNFSD_SUPPORTED_ATTRS_WORD2	0
+#endif /* CONFIG_NFSD_PNFS */
+
 #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
 	NFSD4_SUPPORTED_ATTRS_WORD0
 
 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
-	NFSD4_SUPPORTED_ATTRS_WORD1
+	(NFSD4_SUPPORTED_ATTRS_WORD1	| PNFSD_SUPPORTED_ATTRS_WORD1)
 
 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
-	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+	(NFSD4_SUPPORTED_ATTRS_WORD2	| PNFSD_SUPPORTED_ATTRS_WORD2 | \
+	 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
 
+/* 4.2 */
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #define NFSD4_2_SECURITY_ATTRS		FATTR4_WORD2_SECURITY_LABEL
 #else
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..a9616a4e13cd
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,80 @@
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+struct xdr_stream;
+
+struct nfsd4_deviceid_map {
+	struct list_head	hash;
+	u64			idx;
+	int			fsid_type;
+	u32			fsid[];
+};
+
+struct nfsd4_layout_ops {
+	u32		notify_types;
+
+	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+			struct nfsd4_getdeviceinfo *gdevp);
+	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+			struct nfsd4_getdeviceinfo *gdevp);
+
+	__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+			struct nfsd4_layoutget *lgp);
+	__be32 (*encode_layoutget)(struct xdr_stream *,
+			struct nfsd4_layoutget *lgp);
+
+	__be32 (*proc_layoutcommit)(struct inode *inode,
+			struct nfsd4_layoutcommit *lcp);
+};
+
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+		struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+		struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+}
+
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+		struct nfs4_file *fp)
+{
+}
+static inline void nfsd4_exit_pnfs(void)
+{
+}
+static inline int nfsd4_init_pnfs(void)
+{
+	return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 38ebb1268b59..5f66b7fd0297 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
 /* For a deleg stateid kept around only to process free_stateid's: */
 #define NFS4_REVOKED_DELEG_STID 16
 #define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
 	unsigned char sc_type;
 	stateid_t sc_stateid;
 	struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
 	struct list_head	cl_delegations;
 	struct list_head	cl_revoked;	/* unacknowledged, revoked 4.1 state */
 	struct list_head        cl_lru;         /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+	struct list_head	cl_lo_states;	/* outstanding layout states */
+#endif
 	struct xdr_netobj	cl_name; 	/* id generated by client */
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
@@ -496,6 +500,9 @@ struct nfs4_file {
 	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+	struct list_head	fi_lo_states;
+#endif
 };
 
 /*
@@ -528,6 +535,20 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 	return container_of(s, struct nfs4_ol_stateid, st_stid);
 }
 
+struct nfs4_layout_stateid {
+	struct nfs4_stid		ls_stid;
+	struct list_head		ls_perclnt;
+	struct list_head		ls_perfile;
+	spinlock_t			ls_lock;
+	struct list_head		ls_layouts;
+	u32				ls_layout_type;
+};
+
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_layout_stateid, ls_stid);
+}
+
 /* flags for preprocess_seqid_op() */
 #define RD_STATE	        0x00000010
 #define WR_STATE	        0x00000020
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
 	u32 rca_one_fs;
 };
 
+struct nfsd4_deviceid {
+	u64			fsid_idx;
+	u32			generation;
+	u32			pad;
+};
+
+struct nfsd4_layout_seg {
+	u32			iomode;
+	u64			offset;
+	u64			length;
+};
+
+struct nfsd4_getdeviceinfo {
+	struct nfsd4_deviceid	gd_devid;	/* request */
+	u32			gd_layout_type;	/* request */
+	u32			gd_maxcount;	/* request */
+	u32			gd_notify_types;/* request - response */
+	void			*gd_device;	/* response */
+};
+
+struct nfsd4_layoutget {
+	u64			lg_minlength;	/* request */
+	u32			lg_signal;	/* request */
+	u32			lg_layout_type;	/* request */
+	u32			lg_maxcount;	/* request */
+	stateid_t		lg_sid;		/* request/response */
+	struct nfsd4_layout_seg	lg_seg;		/* request/response */
+	void			*lg_content;	/* response */
+};
+
+struct nfsd4_layoutcommit {
+	stateid_t		lc_sid;		/* request */
+	struct nfsd4_layout_seg	lc_seg;		/* request */
+	u32			lc_reclaim;	/* request */
+	u32			lc_newoffset;	/* request */
+	u64			lc_last_wr;	/* request */
+	struct timespec		lc_mtime;	/* request */
+	u32			lc_layout_type;	/* request */
+	u32			lc_up_len;	/* layout length */
+	void			*lc_up_layout;	/* decoded by callback */
+	u32			lc_size_chg;	/* boolean for response */
+	u64			lc_newsize;	/* response */
+};
+
+struct nfsd4_layoutreturn {
+	u32			lr_return_type;	/* request */
+	u32			lr_layout_type;	/* request */
+	struct nfsd4_layout_seg	lr_seg;		/* request */
+	u32			lr_reclaim;	/* request */
+	u32			lrf_body_len;	/* request */
+	void			*lrf_body;	/* request */
+	stateid_t		lr_sid;		/* request/response */
+	u32			lrs_present;	/* response */
+};
+
 struct nfsd4_fallocate {
 	/* request */
 	stateid_t	falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+		struct nfsd4_getdeviceinfo	getdeviceinfo;
+		struct nfsd4_layoutget		layoutget;
+		struct nfsd4_layoutcommit	layoutcommit;
+		struct nfsd4_layoutreturn	layoutreturn;
 
 		/* NFSv4.2 */
 		struct nfsd4_fallocate		allocate;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 8a3589c2542c..bc10d687f2ce 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -411,6 +411,7 @@ enum lock_type4 {
 #define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
 #define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
 #define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
+#define FATTR4_WORD2_LAYOUT_TYPES       (1UL << 0)
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h
index 1fdc95bb2375..0bf130a1c58d 100644
--- a/include/uapi/linux/nfsd/debug.h
+++ b/include/uapi/linux/nfsd/debug.h
@@ -32,6 +32,7 @@
 #define NFSDDBG_REPCACHE	0x0080
 #define NFSDDBG_XDR		0x0100
 #define NFSDDBG_LOCKD		0x0200
+#define NFSDDBG_PNFS		0x0400
 #define NFSDDBG_ALL		0x7FFF
 #define NFSDDBG_NOCHANGE	0xFFFF
 
diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h
index 584b6ef3a5e8..4742f2cb42f2 100644
--- a/include/uapi/linux/nfsd/export.h
+++ b/include/uapi/linux/nfsd/export.h
@@ -47,8 +47,10 @@
  * exported filesystem.
  */
 #define	NFSEXP_V4ROOT		0x10000
+#define NFSEXP_NOPNFS		0x20000
+
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS		0x1FE7F
+#define NFSEXP_ALLFLAGS		0x3FE7F
 
 /* The flags that may vary depending on security flavor: */
 #define NFSEXP_SECINFO_FLAGS	(NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
-- 
cgit v1.2.3


From 67dc0d4758e5ced978bb30e8af11bc42cabfa77c Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 29 Jan 2015 14:11:25 +1000
Subject: vt_buffer: drop console buffer copying optimisations

These two copy to/from VGA memory, however on the Silicon
Motion SMI750 VGA card on a 64-bit system cause console corruption.

This is due to the hw being buggy and not handling a 64-bit transaction
correctly.

We could try and create a 32-bit version of these routines,
but I'm not sure the optimisation is worth much today.

Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1132826

Tested-by: Huawei engineering.
Signed-off-by: Dave Airlie <airlied@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/vt_buffer.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index 057db7d2f448..f38c10ba3ff5 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -21,10 +21,6 @@
 #ifndef VT_BUF_HAVE_RW
 #define scr_writew(val, addr) (*(addr) = (val))
 #define scr_readw(addr) (*(addr))
-#define scr_memcpyw(d, s, c) memcpy(d, s, c)
-#define scr_memmovew(d, s, c) memmove(d, s, c)
-#define VT_BUF_HAVE_MEMCPYW
-#define VT_BUF_HAVE_MEMMOVEW
 #endif
 
 #ifndef VT_BUF_HAVE_MEMSETW
-- 
cgit v1.2.3


From 01395d798452435f19de3bfe5d04325db4e49677 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Thu, 22 Jan 2015 11:50:24 -0500
Subject: PNP: Allow console to override ACPI device sleep

If the serial console is an ACPI PNP device, the PNP bus always powers
down the device at system suspend, even though the no_console_suspend
command line parameter is specified (eg., when debugging suspend/resume).

Add PNP_CONSOLE capability, which when set, prevents calling both the
->disable() and ->suspend() PNP protocol methods if console suspend
is disabled.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/pnp/driver.c |  2 +-
 include/linux/pnp.h  | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index f748cc8cbb03..4e57d3370368 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -182,7 +182,7 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state)
 			return error;
 	}
 
-	if (pnp_dev->protocol->suspend)
+	if (pnp_can_suspend(pnp_dev))
 		pnp_dev->protocol->suspend(pnp_dev, state);
 	return 0;
 }
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 195aafc6cd07..6512e9cbc6d5 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -12,6 +12,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/mod_devicetable.h>
+#include <linux/console.h>
 
 #define PNP_NAME_LEN		50
 
@@ -309,15 +310,22 @@ struct pnp_fixup {
 #define PNP_DISABLE		0x0004
 #define PNP_CONFIGURABLE	0x0008
 #define PNP_REMOVABLE		0x0010
+#define PNP_CONSOLE		0x0020
 
 #define pnp_can_read(dev)	(((dev)->protocol->get) && \
 				 ((dev)->capabilities & PNP_READ))
 #define pnp_can_write(dev)	(((dev)->protocol->set) && \
 				 ((dev)->capabilities & PNP_WRITE))
-#define pnp_can_disable(dev)	(((dev)->protocol->disable) && \
-				 ((dev)->capabilities & PNP_DISABLE))
+#define pnp_can_disable(dev)	(((dev)->protocol->disable) &&		  \
+				 ((dev)->capabilities & PNP_DISABLE) &&	  \
+				 (!((dev)->capabilities & PNP_CONSOLE) || \
+				  console_suspend_enabled))
 #define pnp_can_configure(dev)	((!(dev)->active) && \
 				 ((dev)->capabilities & PNP_CONFIGURABLE))
+#define pnp_can_suspend(dev)	(((dev)->protocol->suspend) &&		  \
+				 (!((dev)->capabilities & PNP_CONSOLE) || \
+				  console_suspend_enabled))
+
 
 #ifdef CONFIG_ISAPNP
 extern struct pnp_protocol isapnp_protocol;
-- 
cgit v1.2.3


From 3abf87cd3e70009ed70a7f28c2914888a7e27332 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 17 Jan 2015 15:42:04 -0500
Subject: tty: Make lock subclasses available for other tty locks

Besides nested legacy_mutex locking which is required on pty pair
teardown, other nested pty operations require lock subclassing.

Move lock subclass definition to tty interface header, include/linux/tty.h,
and document its use.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_mutex.c | 12 +-----------
 include/linux/tty.h     | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_mutex.c b/drivers/tty/tty_mutex.c
index a872389dc0bc..0efcf713b756 100644
--- a/drivers/tty/tty_mutex.c
+++ b/drivers/tty/tty_mutex.c
@@ -4,18 +4,8 @@
 #include <linux/semaphore.h>
 #include <linux/sched.h>
 
-/*
- * Nested tty locks are necessary for releasing pty pairs.
- * The stable lock order is master pty first, then slave pty.
- */
-
 /* Legacy tty mutex glue */
 
-enum {
-	TTY_MUTEX_NORMAL,
-	TTY_MUTEX_SLAVE,
-};
-
 /*
  * Getting the big tty mutex.
  */
@@ -58,5 +48,5 @@ void __lockfunc tty_unlock_slave(struct tty_struct *tty)
 
 void tty_set_lock_subclass(struct tty_struct *tty)
 {
-	lockdep_set_subclass(&tty->legacy_mutex, TTY_MUTEX_SLAVE);
+	lockdep_set_subclass(&tty->legacy_mutex, TTY_LOCK_SLAVE);
 }
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 7d66ae508e5c..849659908f23 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -14,6 +14,23 @@
 #include <linux/llist.h>
 
 
+/*
+ * Lock subclasses for tty locks
+ *
+ * TTY_LOCK_NORMAL is for normal ttys and master ptys.
+ * TTY_LOCK_SLAVE is for slave ptys only.
+ *
+ * Lock subclasses are necessary for handling nested locking with pty pairs.
+ * tty locks which use nested locking:
+ *
+ * legacy_mutex - Nested tty locks are necessary for releasing pty pairs.
+ *		  The stable lock order is master pty first, then slave pty.
+ */
+
+enum {
+	TTY_LOCK_NORMAL = 0,
+	TTY_LOCK_SLAVE,
+};
 
 /*
  * (Note: the *_driver.minor_start values 1, 64, 128, 192 are
-- 
cgit v1.2.3


From 1d1d14da12e79a6c05fbe1a975401f0f56c93316 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 17 Jan 2015 15:42:05 -0500
Subject: pty: Fix buffer flush deadlock

The pty driver does not clear its write buffer when commanded.
This is to avoid an apparent deadlock between parallel flushes from
both pty ends; specifically when handling either BRK or INTR input.
However, parallel flushes from this source is not possible since
the pty master can never be set to BRKINT or ISIG. Parallel flushes
from other sources are possible but these do not threaten deadlocks.

Annotate the tty buffer mutex for lockdep to represent the nested
tty_buffer locking which occurs when the pty slave is processing input
(its buffer mutex held) and receives INTR or BRK and acquires the
linked tty buffer mutex via tty_buffer_flush().

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/pty.c        | 10 +++++++++-
 drivers/tty/tty_buffer.c |  6 ++++++
 include/linux/tty.h      |  4 ++++
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index f882ac81b93a..0e273158edac 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -212,10 +212,16 @@ static int pty_signal(struct tty_struct *tty, int sig)
 static void pty_flush_buffer(struct tty_struct *tty)
 {
 	struct tty_struct *to = tty->link;
+	struct tty_ldisc *ld;
 
 	if (!to)
 		return;
-	/* tty_buffer_flush(to); FIXME */
+
+	ld = tty_ldisc_ref(to);
+	tty_buffer_flush(to, ld);
+	if (ld)
+		tty_ldisc_deref(ld);
+
 	if (to->packet) {
 		spin_lock_irq(&tty->ctrl_lock);
 		tty->ctrl_status |= TIOCPKT_FLUSHWRITE;
@@ -425,6 +431,8 @@ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty,
 	tty->port = ports[1];
 	o_tty->port->itty = o_tty;
 
+	tty_buffer_set_lock_subclass(o_tty->port);
+
 	tty_driver_kref_get(driver);
 	tty->count++;
 	o_tty->count++;
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index 3605103fc1ac..75661641f5fe 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -557,3 +557,9 @@ int tty_buffer_set_limit(struct tty_port *port, int limit)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(tty_buffer_set_limit);
+
+/* slave ptys can claim nested buffer lock when handling BRK and INTR */
+void tty_buffer_set_lock_subclass(struct tty_port *port)
+{
+	lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE);
+}
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 849659908f23..55964b9490dd 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -25,6 +25,9 @@
  *
  * legacy_mutex - Nested tty locks are necessary for releasing pty pairs.
  *		  The stable lock order is master pty first, then slave pty.
+ * tty_buffer lock - slave ptys can claim nested buffer lock when handling
+ *		     signal chars. The stable lock order is slave pty, then
+ *		     master.
  */
 
 enum {
@@ -460,6 +463,7 @@ extern void tty_flush_to_ldisc(struct tty_struct *tty);
 extern void tty_buffer_free_all(struct tty_port *port);
 extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld);
 extern void tty_buffer_init(struct tty_port *port);
+extern void tty_buffer_set_lock_subclass(struct tty_port *port);
 extern speed_t tty_termios_baud_rate(struct ktermios *termios);
 extern speed_t tty_termios_input_baud_rate(struct ktermios *termios);
 extern void tty_termios_encode_baud_rate(struct ktermios *termios,
-- 
cgit v1.2.3


From d2b6f44779d3be22d32a5697bd30b59367fd2b33 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 17 Jan 2015 15:42:06 -0500
Subject: n_tty: Fix signal handling flushes

BRKINT and ISIG requires input and output flush when a signal char
is received. However, the order of operations is significant since
parallel i/o may be ongoing.

Merge the signal handling for BRKINT with ISIG handling.

Process the signal first. This ensures any ongoing i/o is aborted;
without this, a waiting writer may continue writing after the flush
occurs and after the signal char has been echoed.

Write lock the termios_rwsem, which excludes parallel writers from
pushing new i/o until after the output buffers are flushed; claiming
the write lock is necessary anyway to exclude parallel readers while
the read buffer is flushed.

Subclass the termios_rwsem for ptys since the slave pty performing
the flush may appear to reorder the termios_rwsem->tty buffer lock
lock order; adding annotation clarifies that
  slave tty_buffer lock-> slave termios_rwsem -> master tty_buffer lock
is a valid lock order.

Flush the echo buffer. In this context, the echo buffer is 'output'.
Otherwise, the output will appear discontinuous because the output buffer
was cleared which contains older output than the echo buffer.

Open-code the read buffer flush since the input worker does not need
kicking (this is the input worker).

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c | 45 ++++++++++++++++++++++++++++++---------------
 drivers/tty/pty.c   |  1 +
 include/linux/tty.h |  3 +++
 3 files changed, 34 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 5793aa0d3bfb..cf6e0f2e1331 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1090,16 +1090,45 @@ static void eraser(unsigned char c, struct tty_struct *tty)
  *	Called when a signal is being sent due to terminal input.
  *	Called from the driver receive_buf path so serialized.
  *
+ *	Performs input and output flush if !NOFLSH. In this context, the echo
+ *	buffer is 'output'. The signal is processed first to alert any current
+ *	readers or writers to discontinue and exit their i/o loops.
+ *
  *	Locking: ctrl_lock
  */
 
 static void isig(int sig, struct tty_struct *tty)
 {
+	struct n_tty_data *ldata = tty->disc_data;
 	struct pid *tty_pgrp = tty_get_pgrp(tty);
 	if (tty_pgrp) {
 		kill_pgrp(tty_pgrp, sig, 1);
 		put_pid(tty_pgrp);
 	}
+
+	if (!L_NOFLSH(tty)) {
+		up_read(&tty->termios_rwsem);
+		down_write(&tty->termios_rwsem);
+
+		/* clear echo buffer */
+		mutex_lock(&ldata->output_lock);
+		ldata->echo_head = ldata->echo_tail = 0;
+		ldata->echo_mark = ldata->echo_commit = 0;
+		mutex_unlock(&ldata->output_lock);
+
+		/* clear output buffer */
+		tty_driver_flush_buffer(tty);
+
+		/* clear input buffer */
+		reset_buffer_flags(tty->disc_data);
+
+		/* notify pty master of flush */
+		if (tty->link)
+			n_tty_packet_mode_flush(tty);
+
+		up_write(&tty->termios_rwsem);
+		down_read(&tty->termios_rwsem);
+	}
 }
 
 /**
@@ -1123,13 +1152,6 @@ static void n_tty_receive_break(struct tty_struct *tty)
 		return;
 	if (I_BRKINT(tty)) {
 		isig(SIGINT, tty);
-		if (!L_NOFLSH(tty)) {
-			/* flushing needs exclusive termios_rwsem */
-			up_read(&tty->termios_rwsem);
-			n_tty_flush_buffer(tty);
-			tty_driver_flush_buffer(tty);
-			down_read(&tty->termios_rwsem);
-		}
 		return;
 	}
 	if (I_PARMRK(tty)) {
@@ -1203,13 +1225,7 @@ static void n_tty_receive_parity_error(struct tty_struct *tty, unsigned char c)
 static void
 n_tty_receive_signal_char(struct tty_struct *tty, int signal, unsigned char c)
 {
-	if (!L_NOFLSH(tty)) {
-		/* flushing needs exclusive termios_rwsem */
-		up_read(&tty->termios_rwsem);
-		n_tty_flush_buffer(tty);
-		tty_driver_flush_buffer(tty);
-		down_read(&tty->termios_rwsem);
-	}
+	isig(signal, tty);
 	if (I_IXON(tty))
 		start_tty(tty);
 	if (L_ECHO(tty)) {
@@ -1217,7 +1233,6 @@ n_tty_receive_signal_char(struct tty_struct *tty, int signal, unsigned char c)
 		commit_echoes(tty);
 	} else
 		process_echoes(tty);
-	isig(signal, tty);
 	return;
 }
 
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 0e273158edac..e72ee629cead 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -395,6 +395,7 @@ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty,
 		goto err_put_module;
 
 	tty_set_lock_subclass(o_tty);
+	lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE);
 
 	if (legacy) {
 		/* We always use new tty termios data so we can do this
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 55964b9490dd..fe5623c9af71 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -25,6 +25,9 @@
  *
  * legacy_mutex - Nested tty locks are necessary for releasing pty pairs.
  *		  The stable lock order is master pty first, then slave pty.
+ * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem.
+ *		   Subclassing this lock enables the slave pty to hold its
+ *		   termios_rwsem when claiming the master tty_buffer lock.
  * tty_buffer lock - slave ptys can claim nested buffer lock when handling
  *		     signal chars. The stable lock order is slave pty, then
  *		     master.
-- 
cgit v1.2.3


From 4516d50aabedbe5ae334155193e4d35c02390d9a Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Thu, 22 Jan 2015 12:24:30 -0500
Subject: serial: 8250: Use canary to restart console after suspend

When using no_console_suspend, the serial console may be powered off
anyway during system sleep. Upon resume, the port may be in its default
power-on state, but is expected to continue console i/o before the device
has received its pm callback. The resultant garbage i/o can cause all
kinds of havoc on the remote end.

Use the scratch register as a canary to discover if the console
has been powered-off. Write a non-zero value to the scratch register
at port suspend and reprogram the port before any console i/o if the
scratch register != canary before port resume.

This workaround is disabled for omap_8250 (which uses different divisor
programming).

Credit to Doug Anderson <dianders@chromium.org> for the idea of using
the scratch register canary to discover port power-down.

Cc: Doug Anderson <dianders@chromium.org>
Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c | 35 ++++++++++++++++++++++++++++++++++-
 include/linux/serial_8250.h         |  3 +++
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index c8ecfaf49eb4..1449c56506b7 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -3267,6 +3267,27 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
 	else
 		serial_port_out(port, UART_IER, 0);
 
+	/* check scratch reg to see if port powered off during system sleep */
+	if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
+		struct ktermios termios;
+		unsigned int baud, quot, frac = 0;
+
+		termios.c_cflag = port->cons->cflag;
+		if (port->state->port.tty && termios.c_cflag == 0)
+			termios.c_cflag = port->state->port.tty->termios.c_cflag;
+
+		baud = uart_get_baud_rate(port, &termios, NULL,
+					  port->uartclk / 16 / 0xffff,
+					  port->uartclk / 16);
+		quot = serial8250_get_divisor(up, baud, &frac);
+
+		serial8250_set_divisor(port, baud, quot, frac);
+		serial_port_out(port, UART_LCR, up->lcr);
+		serial_port_out(port, UART_MCR, UART_MCR_DTR | UART_MCR_RTS);
+
+		up->canary = 0;
+	}
+
 	uart_console_write(port, s, count, serial8250_console_putchar);
 
 	/*
@@ -3417,7 +3438,17 @@ int __init early_serial_setup(struct uart_port *port)
  */
 void serial8250_suspend_port(int line)
 {
-	uart_suspend_port(&serial8250_reg, &serial8250_ports[line].port);
+	struct uart_8250_port *up = &serial8250_ports[line];
+	struct uart_port *port = &up->port;
+
+	if (!console_suspend_enabled && uart_console(port) &&
+	    port->type != PORT_8250) {
+		unsigned char canary = 0xa5;
+		serial_out(up, UART_SCR, canary);
+		up->canary = canary;
+	}
+
+	uart_suspend_port(&serial8250_reg, port);
 }
 
 /**
@@ -3431,6 +3462,8 @@ void serial8250_resume_port(int line)
 	struct uart_8250_port *up = &serial8250_ports[line];
 	struct uart_port *port = &up->port;
 
+	up->canary = 0;
+
 	if (up->capabilities & UART_NATSEMI) {
 		/* Ensure it's still in high speed mode */
 		serial_port_out(port, UART_LCR, 0xE0);
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 245b959f1ff6..a8efa235b7c1 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -85,6 +85,9 @@ struct uart_8250_port {
 	unsigned char		mcr_force;	/* mask of forced bits */
 	unsigned char		cur_iotype;	/* Running I/O type */
 	unsigned int		rpm_tx_active;
+	unsigned char		canary;		/* non-zero during system sleep
+						 *   if no_console_suspend
+						 */
 
 	/*
 	 * Some bits in registers are cleared on a read, so they must
-- 
cgit v1.2.3


From 391f93f2ec9f857c83bdd21a14dcf7e699f38579 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sun, 25 Jan 2015 14:44:51 -0500
Subject: serial: core: Rework hw-assisted flow control support

hw-assisted flow control support was added to the serial core
in v3.8 with commits,
dba05832cbe4f ("SERIAL: core: add hardware assisted h/w flow control support")
2cbacafd7af0f ("SERIAL: core: add hardware assisted s/w flow control support")
9aba8d5b01119 ("SERIAL: core: add throttle/unthrottle callbacks for hardware
                assisted flow control")
Since then, additional requirements for serial core support have arisen.
Specifically,
1. Separate tx and rx flow control settings for UARTs which only support
   tx flow control (ie., autoCTS).
2. Disable sw-assisted CTS flow control in autoCTS mode
3. Support for RTS flow control by serial core and userspace in autoRTS mode

Distinguish mode from capability; introduce UPSTAT_AUTORTS, UPSTAT_AUTOCTS
and UPSTAT_AUTOXOFF which, when set by the uart driver, enable serial core
support for hw-assisted rx, hw-assisted tx and hw-assisted in-band/IXOFF
rx flow control, respectively. [Note: hw-assisted in-band/IXON tx flow
control does not require serial core support/intervention and can be
enabled by the uart driver when required.]

These modes must be set/reset in the driver's set_termios() method, based
on termios settings, and thus can be safely queried in any context in which
one of the port lock, port mutex or termios rwsem are held. Set these modes
in the 2 in-tree drivers, omap-serial and 8250_omap, which currently
use UPF_HARD_FLOW/UPF_SOFT_FLOW support.

Retain UPF_HARD_FLOW and UPF_SOFT_FLOW as capabilities; re-define
UPF_HARD_FLOW as both UPF_AUTO_RTS and UPF_AUTO_CTS to allow for distinct
and separate rx and tx flow control capabilities.

Disable sw-assisted CTS flow control when UPSTAT_AUTOCTS is enabled.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_omap.c |  7 +++-
 drivers/tty/serial/omap-serial.c    |  7 +++-
 drivers/tty/serial/serial_core.c    | 76 ++++++++++++++-----------------------
 include/linux/serial_core.h         | 21 ++++++++--
 4 files changed, 59 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 9191d05cfaf0..6e4ff7148ead 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -421,8 +421,11 @@ static void omap_8250_set_termios(struct uart_port *port,
 
 	priv->efr = 0;
 	up->mcr &= ~(UART_MCR_RTS | UART_MCR_XONANY);
+	up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF);
+
 	if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) {
 		/* Enable AUTORTS and AUTOCTS */
+		up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS;
 		priv->efr |= UART_EFR_CTS | UART_EFR_RTS;
 	} else	if (up->port.flags & UPF_SOFT_FLOW) {
 		/*
@@ -438,8 +441,10 @@ static void omap_8250_set_termios(struct uart_port *port,
 		 * Enable XON/XOFF flow control on output.
 		 * Transmit XON1, XOFF1
 		 */
-		if (termios->c_iflag & IXOFF)
+		if (termios->c_iflag & IXOFF) {
+			up->port.status |= UPSTAT_AUTOXOFF;
 			priv->efr |= OMAP_UART_SW_TX;
+		}
 
 		/*
 		 * IXANY Flag:
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index b1cf9a3b673a..6129fe515932 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1053,8 +1053,11 @@ serial_omap_set_termios(struct uart_port *port, struct ktermios *termios,
 
 	serial_out(up, UART_TI752_TCR, OMAP_UART_TCR_TRIG);
 
+	up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF);
+
 	if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) {
 		/* Enable AUTORTS and AUTOCTS */
+		up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS;
 		up->efr |= UART_EFR_CTS | UART_EFR_RTS;
 
 		/* Ensure MCR RTS is asserted */
@@ -1081,8 +1084,10 @@ serial_omap_set_termios(struct uart_port *port, struct ktermios *termios,
 		 * Enable XON/XOFF flow control on output.
 		 * Transmit XON1, XOFF1
 		 */
-		if (termios->c_iflag & IXOFF)
+		if (termios->c_iflag & IXOFF) {
+			up->port.status |= UPSTAT_AUTOXOFF;
 			up->efr |= OMAP_UART_SW_TX;
+		}
 
 		/*
 		 * IXANY Flag:
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 2c67a077042a..6a1055ae3437 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -179,14 +179,6 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state,
 			if (tty->termios.c_cflag & CBAUD)
 				uart_set_mctrl(uport, TIOCM_RTS | TIOCM_DTR);
 		}
-
-		spin_lock_irq(&uport->lock);
-		if (uart_cts_enabled(uport) &&
-		    !(uport->ops->get_mctrl(uport) & TIOCM_CTS))
-			uport->hw_stopped = 1;
-		else
-			uport->hw_stopped = 0;
-		spin_unlock_irq(&uport->lock);
 	}
 
 	/*
@@ -442,6 +434,7 @@ static void uart_change_speed(struct tty_struct *tty, struct uart_state *state,
 {
 	struct uart_port *uport = state->uart_port;
 	struct ktermios *termios;
+	int hw_stopped;
 
 	/*
 	 * If we have no tty, termios, or the port does not exist,
@@ -466,6 +459,18 @@ static void uart_change_speed(struct tty_struct *tty, struct uart_state *state,
 		uport->status &= ~UPSTAT_DCD_ENABLE;
 	else
 		uport->status |= UPSTAT_DCD_ENABLE;
+
+	/* reset sw-assisted CTS flow control based on (possibly) new mode */
+	hw_stopped = uport->hw_stopped;
+	uport->hw_stopped = uart_softcts_mode(uport) &&
+				!(uport->ops->get_mctrl(uport) & TIOCM_CTS);
+	if (uport->hw_stopped) {
+		if (!hw_stopped)
+			uport->ops->stop_tx(uport);
+	} else {
+		if (hw_stopped)
+			__uart_start(tty);
+	}
 	spin_unlock_irq(&uport->lock);
 }
 
@@ -619,22 +624,22 @@ static void uart_throttle(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port = state->uart_port;
-	upf_t mask = 0;
+	upstat_t mask = 0;
 
 	if (I_IXOFF(tty))
-		mask |= UPF_SOFT_FLOW;
+		mask |= UPSTAT_AUTOXOFF;
 	if (tty->termios.c_cflag & CRTSCTS)
-		mask |= UPF_HARD_FLOW;
+		mask |= UPSTAT_AUTORTS;
 
-	if (port->flags & mask) {
+	if (port->status & mask) {
 		port->ops->throttle(port);
-		mask &= ~port->flags;
+		mask &= ~port->status;
 	}
 
-	if (mask & UPF_SOFT_FLOW)
+	if (mask & UPSTAT_AUTOXOFF)
 		uart_send_xchar(tty, STOP_CHAR(tty));
 
-	if (mask & UPF_HARD_FLOW)
+	if (mask & UPSTAT_AUTORTS)
 		uart_clear_mctrl(port, TIOCM_RTS);
 }
 
@@ -642,22 +647,22 @@ static void uart_unthrottle(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port = state->uart_port;
-	upf_t mask = 0;
+	upstat_t mask = 0;
 
 	if (I_IXOFF(tty))
-		mask |= UPF_SOFT_FLOW;
+		mask |= UPSTAT_AUTOXOFF;
 	if (tty->termios.c_cflag & CRTSCTS)
-		mask |= UPF_HARD_FLOW;
+		mask |= UPSTAT_AUTORTS;
 
-	if (port->flags & mask) {
+	if (port->status & mask) {
 		port->ops->unthrottle(port);
-		mask &= ~port->flags;
+		mask &= ~port->status;
 	}
 
-	if (mask & UPF_SOFT_FLOW)
+	if (mask & UPSTAT_AUTOXOFF)
 		uart_send_xchar(tty, START_CHAR(tty));
 
-	if (mask & UPF_HARD_FLOW)
+	if (mask & UPSTAT_AUTORTS)
 		uart_set_mctrl(port, TIOCM_RTS);
 }
 
@@ -1351,30 +1356,6 @@ static void uart_set_termios(struct tty_struct *tty,
 			mask |= TIOCM_RTS;
 		uart_set_mctrl(uport, mask);
 	}
-
-	/*
-	 * If the port is doing h/w assisted flow control, do nothing.
-	 * We assume that port->hw_stopped has never been set.
-	 */
-	if (uport->flags & UPF_HARD_FLOW)
-		return;
-
-	/* Handle turning off CRTSCTS */
-	if ((old_termios->c_cflag & CRTSCTS) && !(cflag & CRTSCTS)) {
-		spin_lock_irq(&uport->lock);
-		uport->hw_stopped = 0;
-		__uart_start(tty);
-		spin_unlock_irq(&uport->lock);
-	}
-	/* Handle turning on CRTSCTS */
-	else if (!(old_termios->c_cflag & CRTSCTS) && (cflag & CRTSCTS)) {
-		spin_lock_irq(&uport->lock);
-		if (!(uport->ops->get_mctrl(uport) & TIOCM_CTS)) {
-			uport->hw_stopped = 1;
-			uport->ops->stop_tx(uport);
-		}
-		spin_unlock_irq(&uport->lock);
-	}
 }
 
 /*
@@ -2855,7 +2836,7 @@ void uart_handle_cts_change(struct uart_port *uport, unsigned int status)
 
 	uport->icount.cts++;
 
-	if (uart_cts_enabled(uport)) {
+	if (uart_softcts_mode(uport)) {
 		if (uport->hw_stopped) {
 			if (status) {
 				uport->hw_stopped = 0;
@@ -2868,6 +2849,7 @@ void uart_handle_cts_change(struct uart_port *uport, unsigned int status)
 				uport->ops->stop_tx(uport);
 			}
 		}
+
 	}
 }
 EXPORT_SYMBOL_GPL(uart_handle_cts_change);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index a0c7033d5f91..baf3e1d08416 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -191,8 +191,10 @@ struct uart_port {
 #define UPF_NO_TXEN_TEST	((__force upf_t) (1 << 15))
 #define UPF_MAGIC_MULTIPLIER	((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ )
 
-/* Port has hardware-assisted h/w flow control (iow, auto-RTS *not* auto-CTS) */
-#define UPF_HARD_FLOW		((__force upf_t) (1 << 21))
+/* Port has hardware-assisted h/w flow control */
+#define UPF_AUTO_CTS		((__force upf_t) (1 << 20))
+#define UPF_AUTO_RTS		((__force upf_t) (1 << 21))
+#define UPF_HARD_FLOW		((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS))
 /* Port has hardware-assisted s/w flow control */
 #define UPF_SOFT_FLOW		((__force upf_t) (1 << 22))
 #define UPF_CONS_FLOW		((__force upf_t) (1 << 23))
@@ -214,11 +216,17 @@ struct uart_port {
 #error Change mask not equivalent to userspace-visible bit defines
 #endif
 
-	/* status must be updated while holding port lock */
+	/*
+	 * Must hold termios_rwsem, port mutex and port lock to change;
+	 * can hold any one lock to read.
+	 */
 	upstat_t		status;
 
 #define UPSTAT_CTS_ENABLE	((__force upstat_t) (1 << 0))
 #define UPSTAT_DCD_ENABLE	((__force upstat_t) (1 << 1))
+#define UPSTAT_AUTORTS		((__force upstat_t) (1 << 2))
+#define UPSTAT_AUTOCTS		((__force upstat_t) (1 << 3))
+#define UPSTAT_AUTOXOFF		((__force upstat_t) (1 << 4))
 
 	int			hw_stopped;		/* sw-assisted CTS flow state */
 	unsigned int		mctrl;			/* current modem ctrl settings */
@@ -392,6 +400,13 @@ static inline bool uart_cts_enabled(struct uart_port *uport)
 	return !!(uport->status & UPSTAT_CTS_ENABLE);
 }
 
+static inline bool uart_softcts_mode(struct uart_port *uport)
+{
+	upstat_t mask = UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS;
+
+	return ((uport->status & mask) == UPSTAT_CTS_ENABLE);
+}
+
 /*
  * The following are helper functions for the low level drivers.
  */
-- 
cgit v1.2.3


From 632f32e2107d37598e3f6816dcf00c7cab4081ca Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sun, 25 Jan 2015 14:44:54 -0500
Subject: tty: Remove external interface for tty_set_termios()

tty_set_termios() is an internal helper intended for file scope use.

UART drivers which are capable of driving the RTS pin must
properly handle the tiocmset() method, regardless of termios settings.
A failure to do so is a UART driver bug and should be fixed there.
Do not use this interface to workaround UART driver bugs.

Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: <linux-bluetooth@vger.kernel.org>
Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/bluetooth/hci_ath.c | 15 ++-------------
 drivers/tty/tty_ioctl.c     |  3 +--
 include/linux/tty.h         |  1 -
 3 files changed, 3 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bluetooth/hci_ath.c b/drivers/bluetooth/hci_ath.c
index 2ff6dfd2d3f0..9c4dcf4c62ea 100644
--- a/drivers/bluetooth/hci_ath.c
+++ b/drivers/bluetooth/hci_ath.c
@@ -51,33 +51,22 @@ struct ath_struct {
 
 static int ath_wakeup_ar3k(struct tty_struct *tty)
 {
-	struct ktermios ktermios;
 	int status = tty->driver->ops->tiocmget(tty);
 
 	if (status & TIOCM_CTS)
 		return status;
 
-	/* Disable Automatic RTSCTS */
-	ktermios = tty->termios;
-	ktermios.c_cflag &= ~CRTSCTS;
-	tty_set_termios(tty, &ktermios);
-
 	/* Clear RTS first */
-	status = tty->driver->ops->tiocmget(tty);
+	tty->driver->ops->tiocmget(tty);
 	tty->driver->ops->tiocmset(tty, 0x00, TIOCM_RTS);
 	mdelay(20);
 
 	/* Set RTS, wake up board */
-	status = tty->driver->ops->tiocmget(tty);
+	tty->driver->ops->tiocmget(tty);
 	tty->driver->ops->tiocmset(tty, TIOCM_RTS, 0x00);
 	mdelay(20);
 
 	status = tty->driver->ops->tiocmget(tty);
-
-	/* Enable Automatic RTSCTS */
-	ktermios.c_cflag |= CRTSCTS;
-	status = tty_set_termios(tty, &ktermios);
-
 	return status;
 }
 
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 1787fa4d9448..a5cf253b2544 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -530,7 +530,7 @@ EXPORT_SYMBOL(tty_termios_hw_change);
  *	Locking: termios_rwsem
  */
 
-int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios)
+static int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios)
 {
 	struct ktermios old_termios;
 	struct tty_ldisc *ld;
@@ -563,7 +563,6 @@ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios)
 	up_write(&tty->termios_rwsem);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(tty_set_termios);
 
 /**
  *	set_termios		-	set termios values for a tty
diff --git a/include/linux/tty.h b/include/linux/tty.h
index fe5623c9af71..358a337af598 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -491,7 +491,6 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 
 extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old);
 extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b);
-extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);
 
 extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
 extern void tty_ldisc_deref(struct tty_ldisc *);
-- 
cgit v1.2.3


From 63e144c9d6ffa791c1402f4ee4551c1b9f5a336a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 15 Jan 2015 14:42:27 +0300
Subject: ti-st: clean up data types (fix harmless memory corruption)

The big issue here is:

	of_property_read_u32(np, "flow_cntrl", (u32 *)&dt_pdata->flow_cntrl);

"->flow_cntrl" is a char so when we write a 32 bit number to it then it
corrupts past the end of the char.  It's probably hard to notice because
the struct has padding so the code works on little endian systems. But
on a big endian system the code would fail and on a 64 bit, big endian
systems then "nshutdown_gpio" and "baud_rate" would be buggy as well.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/ti-st/st_kim.c  | 12 ++++++------
 include/linux/ti_wilink_st.h | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c
index 8fb116f8a152..18e7a03985d4 100644
--- a/drivers/misc/ti-st/st_kim.c
+++ b/drivers/misc/ti-st/st_kim.c
@@ -638,7 +638,7 @@ static ssize_t show_baud_rate(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct kim_data_s *kim_data = dev_get_drvdata(dev);
-	return sprintf(buf, "%ld\n", kim_data->baud_rate);
+	return sprintf(buf, "%d\n", kim_data->baud_rate);
 }
 
 static ssize_t show_flow_cntrl(struct device *dev,
@@ -760,9 +760,9 @@ static struct ti_st_plat_data *get_platform_data(struct device *dev)
 	if (dt_property)
 		memcpy(&dt_pdata->dev_name, dt_property, len);
 	of_property_read_u32(np, "nshutdown_gpio",
-			     (u32 *)&dt_pdata->nshutdown_gpio);
-	of_property_read_u32(np, "flow_cntrl", (u32 *)&dt_pdata->flow_cntrl);
-	of_property_read_u32(np, "baud_rate", (u32 *)&dt_pdata->baud_rate);
+			     &dt_pdata->nshutdown_gpio);
+	of_property_read_u32(np, "flow_cntrl", &dt_pdata->flow_cntrl);
+	of_property_read_u32(np, "baud_rate", &dt_pdata->baud_rate);
 
 	return dt_pdata;
 }
@@ -812,14 +812,14 @@ static int kim_probe(struct platform_device *pdev)
 	kim_gdata->nshutdown = pdata->nshutdown_gpio;
 	err = gpio_request(kim_gdata->nshutdown, "kim");
 	if (unlikely(err)) {
-		pr_err(" gpio %ld request failed ", kim_gdata->nshutdown);
+		pr_err(" gpio %d request failed ", kim_gdata->nshutdown);
 		return err;
 	}
 
 	/* Configure nShutdown GPIO as output=0 */
 	err = gpio_direction_output(kim_gdata->nshutdown, 0);
 	if (unlikely(err)) {
-		pr_err(" unable to configure gpio %ld", kim_gdata->nshutdown);
+		pr_err(" unable to configure gpio %d", kim_gdata->nshutdown);
 		return err;
 	}
 	/* get reference of pdev for request_firmware
diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h
index 9072d9f95cff..c78dcfeaf25f 100644
--- a/include/linux/ti_wilink_st.h
+++ b/include/linux/ti_wilink_st.h
@@ -262,7 +262,7 @@ struct kim_data_s {
 	struct completion kim_rcvd, ldisc_installed;
 	char resp_buffer[30];
 	const struct firmware *fw_entry;
-	long nshutdown;
+	unsigned nshutdown;
 	unsigned long rx_state;
 	unsigned long rx_count;
 	struct sk_buff *rx_skb;
@@ -270,8 +270,8 @@ struct kim_data_s {
 	struct chip_version version;
 	unsigned char ldisc_install;
 	unsigned char dev_name[UART_DEV_NAME_LEN + 1];
-	unsigned char flow_cntrl;
-	unsigned long baud_rate;
+	unsigned flow_cntrl;
+	unsigned baud_rate;
 };
 
 /**
@@ -437,10 +437,10 @@ struct gps_event_hdr {
  *
  */
 struct ti_st_plat_data {
-	long nshutdown_gpio;
+	u32 nshutdown_gpio;
 	unsigned char dev_name[UART_DEV_NAME_LEN]; /* uart name */
-	unsigned char flow_cntrl; /* flow control flag */
-	unsigned long baud_rate;
+	u32 flow_cntrl; /* flow control flag */
+	u32 baud_rate;
 	int (*suspend)(struct platform_device *, pm_message_t);
 	int (*resume)(struct platform_device *);
 	int (*chip_enable) (struct kim_data_s *);
-- 
cgit v1.2.3


From 2cbf7fe2d5d32a4747c1f8ad163e886dccad930c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 3 Feb 2015 13:18:55 +0000
Subject: i2o: move to staging

The I2O layer deals with a technology that to say the least didn't catch on
in the market.

The only relevant products are some of the AMI MegaRAID - which supported I2O
and its native mode (The native mode is faster and runs on Linux), an
obscure crypto ethernet card that's now so many years out of date nobody
would use it, the old DPT controllers, which speak their own dialect and
have their own driver - and ermm.. thats about it.

We also know the code isn't in good shape as recently a patch was proposed
and queried as buggy, which in turn showed the existing code was broken
already by prior "clean up" and nobody had noticed that either.

It's coding style robot code nothing more. Like some forgotten corridor
cleaned relentlessly by a lost Roomba but where no user has trodden in years.

Move it to staging and then to /dev/null.

The headers remain as they are shared with dpt_i2o.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/Kconfig                  |    2 -
 drivers/message/Makefile         |    1 -
 drivers/message/i2o/Kconfig      |  121 ---
 drivers/message/i2o/Makefile     |   16 -
 drivers/message/i2o/README       |   98 --
 drivers/message/i2o/README.ioctl |  394 --------
 drivers/message/i2o/bus-osm.c    |  176 ----
 drivers/message/i2o/config-osm.c |   90 --
 drivers/message/i2o/core.h       |   69 --
 drivers/message/i2o/debug.c      |  472 ---------
 drivers/message/i2o/device.c     |  594 -----------
 drivers/message/i2o/driver.c     |  382 -------
 drivers/message/i2o/exec-osm.c   |  612 ------------
 drivers/message/i2o/i2o_block.c  | 1228 -----------------------
 drivers/message/i2o/i2o_block.h  |  103 --
 drivers/message/i2o/i2o_config.c | 1163 ----------------------
 drivers/message/i2o/i2o_proc.c   | 2045 --------------------------------------
 drivers/message/i2o/i2o_scsi.c   |  814 ---------------
 drivers/message/i2o/iop.c        | 1247 -----------------------
 drivers/message/i2o/memory.c     |  313 ------
 drivers/message/i2o/pci.c        |  497 ---------
 drivers/staging/Kconfig          |    2 +
 drivers/staging/Makefile         |    1 +
 drivers/staging/i2o/Kconfig      |  120 +++
 drivers/staging/i2o/Makefile     |   16 +
 drivers/staging/i2o/README       |   98 ++
 drivers/staging/i2o/README.ioctl |  394 ++++++++
 drivers/staging/i2o/bus-osm.c    |  176 ++++
 drivers/staging/i2o/config-osm.c |   90 ++
 drivers/staging/i2o/core.h       |   69 ++
 drivers/staging/i2o/debug.c      |  472 +++++++++
 drivers/staging/i2o/device.c     |  594 +++++++++++
 drivers/staging/i2o/driver.c     |  382 +++++++
 drivers/staging/i2o/exec-osm.c   |  612 ++++++++++++
 drivers/staging/i2o/i2o.h        |  988 ++++++++++++++++++
 drivers/staging/i2o/i2o_block.c  | 1228 +++++++++++++++++++++++
 drivers/staging/i2o/i2o_block.h  |  103 ++
 drivers/staging/i2o/i2o_config.c | 1163 ++++++++++++++++++++++
 drivers/staging/i2o/i2o_proc.c   | 2045 ++++++++++++++++++++++++++++++++++++++
 drivers/staging/i2o/i2o_scsi.c   |  814 +++++++++++++++
 drivers/staging/i2o/iop.c        | 1247 +++++++++++++++++++++++
 drivers/staging/i2o/memory.c     |  313 ++++++
 drivers/staging/i2o/pci.c        |  497 +++++++++
 include/linux/i2o.h              |  988 ------------------
 44 files changed, 11424 insertions(+), 11425 deletions(-)
 delete mode 100644 drivers/message/i2o/Kconfig
 delete mode 100644 drivers/message/i2o/Makefile
 delete mode 100644 drivers/message/i2o/README
 delete mode 100644 drivers/message/i2o/README.ioctl
 delete mode 100644 drivers/message/i2o/bus-osm.c
 delete mode 100644 drivers/message/i2o/config-osm.c
 delete mode 100644 drivers/message/i2o/core.h
 delete mode 100644 drivers/message/i2o/debug.c
 delete mode 100644 drivers/message/i2o/device.c
 delete mode 100644 drivers/message/i2o/driver.c
 delete mode 100644 drivers/message/i2o/exec-osm.c
 delete mode 100644 drivers/message/i2o/i2o_block.c
 delete mode 100644 drivers/message/i2o/i2o_block.h
 delete mode 100644 drivers/message/i2o/i2o_config.c
 delete mode 100644 drivers/message/i2o/i2o_proc.c
 delete mode 100644 drivers/message/i2o/i2o_scsi.c
 delete mode 100644 drivers/message/i2o/iop.c
 delete mode 100644 drivers/message/i2o/memory.c
 delete mode 100644 drivers/message/i2o/pci.c
 create mode 100644 drivers/staging/i2o/Kconfig
 create mode 100644 drivers/staging/i2o/Makefile
 create mode 100644 drivers/staging/i2o/README
 create mode 100644 drivers/staging/i2o/README.ioctl
 create mode 100644 drivers/staging/i2o/bus-osm.c
 create mode 100644 drivers/staging/i2o/config-osm.c
 create mode 100644 drivers/staging/i2o/core.h
 create mode 100644 drivers/staging/i2o/debug.c
 create mode 100644 drivers/staging/i2o/device.c
 create mode 100644 drivers/staging/i2o/driver.c
 create mode 100644 drivers/staging/i2o/exec-osm.c
 create mode 100644 drivers/staging/i2o/i2o.h
 create mode 100644 drivers/staging/i2o/i2o_block.c
 create mode 100644 drivers/staging/i2o/i2o_block.h
 create mode 100644 drivers/staging/i2o/i2o_config.c
 create mode 100644 drivers/staging/i2o/i2o_proc.c
 create mode 100644 drivers/staging/i2o/i2o_scsi.c
 create mode 100644 drivers/staging/i2o/iop.c
 create mode 100644 drivers/staging/i2o/memory.c
 create mode 100644 drivers/staging/i2o/pci.c
 delete mode 100644 include/linux/i2o.h

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index c70d6e45dc10..c0cc96bab9e7 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -36,8 +36,6 @@ source "drivers/message/fusion/Kconfig"
 
 source "drivers/firewire/Kconfig"
 
-source "drivers/message/i2o/Kconfig"
-
 source "drivers/macintosh/Kconfig"
 
 source "drivers/net/Kconfig"
diff --git a/drivers/message/Makefile b/drivers/message/Makefile
index 97ef5a01ad11..755676ded67c 100644
--- a/drivers/message/Makefile
+++ b/drivers/message/Makefile
@@ -2,5 +2,4 @@
 # Makefile for MPT based block devices
 #
 
-obj-$(CONFIG_I2O)	+= i2o/
 obj-$(CONFIG_FUSION)	+= fusion/
diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
deleted file mode 100644
index 5afa0e393ecf..000000000000
--- a/drivers/message/i2o/Kconfig
+++ /dev/null
@@ -1,121 +0,0 @@
-
-menuconfig I2O
-	tristate "I2O device support"
-	depends on PCI
-	---help---
-	  The Intelligent Input/Output (I2O) architecture allows hardware
-	  drivers to be split into two parts: an operating system specific
-	  module called the OSM and an hardware specific module called the
-	  HDM. The OSM can talk to a whole range of HDM's, and ideally the
-	  HDM's are not OS dependent. This allows for the same HDM driver to
-	  be used under different operating systems if the relevant OSM is in
-	  place. In order for this to work, you need to have an I2O interface
-	  adapter card in your computer. This card contains a special I/O
-	  processor (IOP), thus allowing high speeds since the CPU does not
-	  have to deal with I/O.
-
-	  If you say Y here, you will get a choice of interface adapter
-	  drivers and OSM's with the following questions.
-
-	  To compile this support as a module, choose M here: the
-	  modules will be called i2o_core.
-
-	  If unsure, say N.
-
-if I2O
-
-config I2O_LCT_NOTIFY_ON_CHANGES
-	bool "Enable LCT notification"
-	default y
-	---help---
-	  Only say N here if you have a I2O controller from SUN. The SUN
-	  firmware doesn't support LCT notification on changes. If this option
-	  is enabled on such a controller the driver will hang up in a endless
-	  loop. On all other controllers say Y.
-
-	  If unsure, say Y.
-
-config I2O_EXT_ADAPTEC
-	bool "Enable Adaptec extensions"
-	default y
-	---help---
-	  Say Y for support of raidutils for Adaptec I2O controllers. You also
-	  have to say Y to "I2O Configuration support", "I2O SCSI OSM" below
-	  and to "SCSI generic support" under "SCSI device configuration".
-
-config I2O_EXT_ADAPTEC_DMA64
-	bool "Enable 64-bit DMA"
-	depends on I2O_EXT_ADAPTEC && ( 64BIT || HIGHMEM64G )
-	default y
-	---help---
-	  Say Y for support of 64-bit DMA transfer mode on Adaptec I2O
-	  controllers.
-	  Note: You need at least firmware version 3709.
-
-config I2O_CONFIG
-	tristate "I2O Configuration support"
-	depends on VIRT_TO_BUS
-	---help---
-	  Say Y for support of the configuration interface for the I2O adapters.
-	  If you have a RAID controller from Adaptec and you want to use the
-	  raidutils to manage your RAID array, you have to say Y here.
-
-	  To compile this support as a module, choose M here: the
-	  module will be called i2o_config.
-
-	  Note: If you want to use the new API you have to download the
-	  i2o_config patch from http://i2o.shadowconnect.com/
-
-config I2O_CONFIG_OLD_IOCTL
-	bool "Enable ioctls (OBSOLETE)"
-	depends on I2O_CONFIG
-	default y
-	---help---
-	  Enables old ioctls.
-
-config I2O_BUS
-	tristate "I2O Bus Adapter OSM"
-	---help---
-	  Include support for the I2O Bus Adapter OSM. The Bus Adapter OSM
-	  provides access to the busses on the I2O controller. The main purpose
-	  is to rescan the bus to find new devices.
-
-	  To compile this support as a module, choose M here: the
-	  module will be called i2o_bus.
-
-config I2O_BLOCK
-	tristate "I2O Block OSM"
-	depends on BLOCK
-	---help---
-	  Include support for the I2O Block OSM. The Block OSM presents disk
-	  and other structured block devices to the operating system. If you
-	  are using an RAID controller, you could access the array only by
-	  the Block OSM driver. But it is possible to access the single disks
-	  by the SCSI OSM driver, for example to monitor the disks.
-
-	  To compile this support as a module, choose M here: the
-	  module will be called i2o_block.
-
-config I2O_SCSI
-	tristate "I2O SCSI OSM"
-	depends on SCSI
-	---help---
-	  Allows direct SCSI access to SCSI devices on a SCSI or FibreChannel
-	  I2O controller. You can use both the SCSI and Block OSM together if
-	  you wish. To access a RAID array, you must use the Block OSM driver.
-	  But you could use the SCSI OSM driver to monitor the single disks.
-
-	  To compile this support as a module, choose M here: the
-	  module will be called i2o_scsi.
-
-config I2O_PROC
-	tristate "I2O /proc support"
-	---help---
-	  If you say Y here and to "/proc file system support", you will be
-	  able to read I2O related information from the virtual directory
-	  /proc/i2o.
-
-	  To compile this support as a module, choose M here: the
-	  module will be called i2o_proc.
-
-endif # I2O
diff --git a/drivers/message/i2o/Makefile b/drivers/message/i2o/Makefile
deleted file mode 100644
index b0982dacfd0a..000000000000
--- a/drivers/message/i2o/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-#
-# Makefile for the kernel I2O OSM.
-#
-# Note : at this point, these files are compiled on all systems.
-# In the future, some of these should be built conditionally.
-#
-
-i2o_core-y		+= iop.o driver.o device.o debug.o pci.o exec-osm.o memory.o
-i2o_bus-y		+= bus-osm.o
-i2o_config-y		+= config-osm.o
-obj-$(CONFIG_I2O)	+= i2o_core.o
-obj-$(CONFIG_I2O_CONFIG)+= i2o_config.o
-obj-$(CONFIG_I2O_BUS)	+= i2o_bus.o
-obj-$(CONFIG_I2O_BLOCK)	+= i2o_block.o
-obj-$(CONFIG_I2O_SCSI)	+= i2o_scsi.o
-obj-$(CONFIG_I2O_PROC)	+= i2o_proc.o
diff --git a/drivers/message/i2o/README b/drivers/message/i2o/README
deleted file mode 100644
index f072a8eb3041..000000000000
--- a/drivers/message/i2o/README
+++ /dev/null
@@ -1,98 +0,0 @@
-
-	Linux I2O Support	(c) Copyright 1999 Red Hat Software
-					and others.
-
-	This program is free software; you can redistribute it and/or
-	modify it under the terms of the GNU General Public License
-	as published by the Free Software Foundation; either version
-	2 of the License, or (at your option) any later version.
-
-AUTHORS (so far)
-
-Alan Cox, Building Number Three Ltd.
-	Core code, SCSI and Block OSMs
-
-Steve Ralston, LSI Logic Corp.
-	Debugging SCSI and Block OSM
-
-Deepak Saxena, Intel Corp.
-	Various core/block extensions
-	/proc interface, bug fixes
-	Ioctl interfaces for control
-	Debugging LAN OSM
-
-Philip Rumpf
-	Fixed assorted dumb SMP locking bugs
-
-Juha Sievanen, University of Helsinki Finland
-	LAN OSM code
-	/proc interface to LAN class
-	Bug fixes
-	Core code extensions
-
-Auvo Häkkinen, University of Helsinki Finland
-	LAN OSM code
-	/Proc interface to LAN class
-	Bug fixes
-	Core code extensions
-
-Taneli Vähäkangas, University of Helsinki Finland
-	Fixes to i2o_config
-
-CREDITS
-
-	This work was made possible by 
-
-Red Hat Software
-	Funding for the Building #3 part of the project
-
-Symbios Logic (Now LSI)
-	Host adapters, hints, known to work platforms when I hit
-	compatibility problems
-
-BoxHill Corporation
-	Loan of initial FibreChannel disk array used for development work.
-
-European Commission
-	Funding the work done by the University of Helsinki
-
-SysKonnect
-        Loan of FDDI and Gigabit Ethernet cards
-
-ASUSTeK
-        Loan of I2O motherboard 
-
-STATUS:
-
-o	The core setup works within limits.
-o	The scsi layer seems to almost work. 
-           I'm still chasing down the hang bug.
-o	The block OSM is mostly functional
-o	LAN OSM works with FDDI and Ethernet cards.
-
-TO DO:
-
-General:
-o	Provide hidden address space if asked
-o	Long term message flow control
-o	PCI IOP's without interrupts are not supported yet
-o	Push FAIL handling into the core
-o	DDM control interfaces for module load etc
-o       Add I2O 2.0 support (Deffered to 2.5 kernel)
-
-Block:
-o	Multiple major numbers
-o	Read ahead and cache handling stuff. Talk to Ingo and people
-o	Power management
-o	Finish Media changers
-
-SCSI:
-o	Find the right way to associate drives/luns/busses
-
-Lan:	
-o	Performance tuning
-o	Test Fibre Channel code
-
-Tape:
-o	Anyone seen anything implementing this ?
-           (D.S: Will attempt to do so if spare cycles permit)
diff --git a/drivers/message/i2o/README.ioctl b/drivers/message/i2o/README.ioctl
deleted file mode 100644
index 4a7d2ebdfc97..000000000000
--- a/drivers/message/i2o/README.ioctl
+++ /dev/null
@@ -1,394 +0,0 @@
-
-Linux I2O User Space Interface
-rev 0.3 - 04/20/99
-
-=============================================================================
-Originally written by Deepak Saxena(deepak@plexity.net)
-Currently maintained by Deepak Saxena(deepak@plexity.net)
-=============================================================================
-
-I. Introduction
-
-The Linux I2O subsystem provides a set of ioctl() commands that can be
-utilized by user space applications to communicate with IOPs and devices
-on individual IOPs. This document defines the specific ioctl() commands
-that are available to the user and provides examples of their uses.
-
-This document assumes the reader is familiar with or has access to the 
-I2O specification as no I2O message parameters are outlined.  For information 
-on the specification, see http://www.i2osig.org
-
-This document and the I2O user space interface are currently maintained
-by Deepak Saxena.  Please send all comments, errata, and bug fixes to
-deepak@csociety.purdue.edu
-
-II. IOP Access
-
-Access to the I2O subsystem is provided through the device file named 
-/dev/i2o/ctl.  This file is a character file with major number 10 and minor
-number 166.  It can be created through the following command:
-
-   mknod /dev/i2o/ctl c 10 166
-
-III. Determining the IOP Count
-
-   SYNOPSIS 
-
-   ioctl(fd, I2OGETIOPS,  int *count);
-
-   u8 count[MAX_I2O_CONTROLLERS];
-
-   DESCRIPTION
-
-   This function returns the system's active IOP table.  count should
-   point to a buffer containing MAX_I2O_CONTROLLERS entries.  Upon 
-   returning, each entry will contain a non-zero value if the given
-   IOP unit is active, and NULL if it is inactive or non-existent.
-
-   RETURN VALUE.
-
-   Returns 0 if no errors occur, and -1 otherwise.  If an error occurs,
-   errno is set appropriately:
-
-     EFAULT   Invalid user space pointer was passed
-
-IV. Getting Hardware Resource Table
-
-   SYNOPSIS 
- 
-   ioctl(fd, I2OHRTGET, struct i2o_cmd_hrt *hrt);
-
-      struct i2o_cmd_hrtlct
-      {
-         u32   iop;      /* IOP unit number */
-         void  *resbuf;  /* Buffer for result */
-         u32   *reslen;  /* Buffer length in bytes */
-      };
-
-   DESCRIPTION
-
-   This function returns the Hardware Resource Table of the IOP specified 
-   by hrt->iop in the buffer pointed to by hrt->resbuf. The actual size of 
-   the data is written into *(hrt->reslen).
-
-   RETURNS
-
-   This function returns 0 if no errors occur. If an error occurs, -1 
-   is returned and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ENOBUFS     Buffer not large enough.  If this occurs, the required
-                  buffer length is written into *(hrt->reslen)
-  
-V. Getting Logical Configuration Table
-   
-   SYNOPSIS 
- 
-   ioctl(fd, I2OLCTGET, struct i2o_cmd_lct *lct);
-
-      struct i2o_cmd_hrtlct
-      {
-         u32   iop;      /* IOP unit number */
-         void  *resbuf;  /* Buffer for result */
-         u32   *reslen;  /* Buffer length in bytes */
-      };
-
-   DESCRIPTION
-
-   This function returns the Logical Configuration Table of the IOP specified
-   by lct->iop in the buffer pointed to by lct->resbuf. The actual size of 
-   the data is written into *(lct->reslen).
-
-   RETURNS
-
-   This function returns 0 if no errors occur. If an error occurs, -1 
-   is returned and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ENOBUFS     Buffer not large enough.  If this occurs, the required
-                  buffer length is written into *(lct->reslen)
-
-VI. Setting Parameters
-   
-   SYNOPSIS 
- 
-   ioctl(fd, I2OPARMSET, struct i2o_parm_setget *ops);
-
-      struct i2o_cmd_psetget
-      {
-         u32   iop;      /* IOP unit number */
-         u32   tid;      /* Target device TID */
-         void  *opbuf;   /* Operation List buffer */
-         u32   oplen;    /* Operation List buffer length in bytes */
-         void  *resbuf;  /* Result List buffer */
-         u32   *reslen;  /* Result List buffer length in bytes */
-      };
-
-   DESCRIPTION
-
-   This function posts a UtilParamsSet message to the device identified
-   by ops->iop and ops->tid.  The operation list for the message is 
-   sent through the ops->opbuf buffer, and the result list is written
-   into the buffer pointed to by ops->resbuf.  The number of bytes 
-   written is placed into *(ops->reslen). 
-
-   RETURNS
-
-   The return value is the size in bytes of the data written into
-   ops->resbuf if no errors occur.  If an error occurs, -1 is returned 
-   and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ENOBUFS     Buffer not large enough.  If this occurs, the required
-                  buffer length is written into *(ops->reslen)
-      ETIMEDOUT   Timeout waiting for reply message
-      ENOMEM      Kernel memory allocation error
-
-   A return value of 0 does not mean that the value was actually
-   changed properly on the IOP.  The user should check the result
-   list to determine the specific status of the transaction.
-
-VII. Getting Parameters
-   
-   SYNOPSIS 
- 
-   ioctl(fd, I2OPARMGET, struct i2o_parm_setget *ops);
-
-      struct i2o_parm_setget
-      {
-         u32   iop;      /* IOP unit number */
-         u32   tid;      /* Target device TID */
-         void  *opbuf;   /* Operation List buffer */
-         u32   oplen;    /* Operation List buffer length in bytes */
-         void  *resbuf;  /* Result List buffer */
-         u32   *reslen;  /* Result List buffer length in bytes */
-      };
-
-   DESCRIPTION
-
-   This function posts a UtilParamsGet message to the device identified
-   by ops->iop and ops->tid.  The operation list for the message is 
-   sent through the ops->opbuf buffer, and the result list is written
-   into the buffer pointed to by ops->resbuf.  The actual size of data
-   written is placed into *(ops->reslen).
-
-   RETURNS
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ENOBUFS     Buffer not large enough.  If this occurs, the required
-                  buffer length is written into *(ops->reslen)
-      ETIMEDOUT   Timeout waiting for reply message
-      ENOMEM      Kernel memory allocation error
-
-   A return value of 0 does not mean that the value was actually
-   properly retrieved.  The user should check the result list 
-   to determine the specific status of the transaction.
-
-VIII. Downloading Software
-   
-   SYNOPSIS 
- 
-   ioctl(fd, I2OSWDL, struct i2o_sw_xfer *sw);
-
-      struct i2o_sw_xfer
-      {
-         u32   iop;       /* IOP unit number */
-         u8    flags;     /* DownloadFlags field */
-         u8    sw_type;   /* Software type */
-         u32   sw_id;     /* Software ID */
-         void  *buf;      /* Pointer to software buffer */
-         u32   *swlen;    /* Length of software buffer */        
-         u32   *maxfrag;  /* Number of fragments */
-         u32   *curfrag;  /* Current fragment number */
-      };
-
-   DESCRIPTION
-
-   This function downloads a software fragment pointed by sw->buf
-   to the iop identified by sw->iop. The DownloadFlags, SwID, SwType
-   and SwSize fields of the ExecSwDownload message are filled in with
-   the values of sw->flags, sw->sw_id, sw->sw_type and *(sw->swlen).
-
-   The fragments _must_ be sent in order and be 8K in size. The last
-   fragment _may_ be shorter, however. The kernel will compute its
-   size based on information in the sw->swlen field.
-
-   Please note that SW transfers can take a long time.
-
-   RETURNS
-
-   This function returns 0 no errors occur. If an error occurs, -1 
-   is returned and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ETIMEDOUT   Timeout waiting for reply message
-      ENOMEM      Kernel memory allocation error
-
-IX. Uploading Software
-   
-   SYNOPSIS 
-
-   ioctl(fd, I2OSWUL, struct i2o_sw_xfer *sw);
-
-      struct i2o_sw_xfer
-      {
-         u32   iop;      /* IOP unit number */
-         u8    flags; 	 /* UploadFlags */
-         u8    sw_type;  /* Software type */
-         u32   sw_id;    /* Software ID */
-         void  *buf;     /* Pointer to software buffer */
-         u32   *swlen;   /* Length of software buffer */        
-         u32   *maxfrag; /* Number of fragments */
-         u32   *curfrag; /* Current fragment number */
-      };
-
-   DESCRIPTION
-
-   This function uploads a software fragment from the IOP identified
-   by sw->iop, sw->sw_type, sw->sw_id and optionally sw->swlen fields.
-   The UploadFlags, SwID, SwType and SwSize fields of the ExecSwUpload
-   message are filled in with the values of sw->flags, sw->sw_id,
-   sw->sw_type and *(sw->swlen).
-
-   The fragments _must_ be requested in order and be 8K in size. The
-   user is responsible for allocating memory pointed by sw->buf. The
-   last fragment _may_ be shorter.
-
-   Please note that SW transfers can take a long time.
-
-   RETURNS
-
-   This function returns 0 if no errors occur.  If an error occurs, -1
-   is returned and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ETIMEDOUT   Timeout waiting for reply message
-      ENOMEM      Kernel memory allocation error
-         
-X. Removing Software
-   
-   SYNOPSIS 
- 
-   ioctl(fd, I2OSWDEL, struct i2o_sw_xfer *sw);
-
-      struct i2o_sw_xfer
-      {
-         u32   iop;      /* IOP unit number */
-         u8    flags; 	 /* RemoveFlags */
-         u8    sw_type;  /* Software type */
-         u32   sw_id;    /* Software ID */
-         void  *buf;     /* Unused */
-         u32   *swlen;   /* Length of the software data */        
-         u32   *maxfrag; /* Unused */
-         u32   *curfrag; /* Unused */
-      };
-
-   DESCRIPTION
-
-   This function removes software from the IOP identified by sw->iop.
-   The RemoveFlags, SwID, SwType and SwSize fields of the ExecSwRemove message 
-   are filled in with the values of sw->flags, sw->sw_id, sw->sw_type and 
-   *(sw->swlen). Give zero in *(sw->len) if the value is unknown. IOP uses 
-   *(sw->swlen) value to verify correct identication of the module to remove. 
-   The actual size of the module is written into *(sw->swlen).
-
-   RETURNS
-
-   This function returns 0 if no errors occur.  If an error occurs, -1
-   is returned and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ETIMEDOUT   Timeout waiting for reply message
-      ENOMEM      Kernel memory allocation error
-
-X. Validating Configuration
-
-   SYNOPSIS
-
-   ioctl(fd, I2OVALIDATE, int *iop);
-	u32 iop;
-
-   DESCRIPTION
-
-   This function posts an ExecConfigValidate message to the controller
-   identified by iop. This message indicates that the current
-   configuration is accepted. The iop changes the status of suspect drivers 
-   to valid and may delete old drivers from its store.
-
-   RETURNS
-
-   This function returns 0 if no erro occur.  If an error occurs, -1 is
-   returned and errno is set appropriately:
-
-      ETIMEDOUT   Timeout waiting for reply message
-      ENXIO       Invalid IOP number
-
-XI. Configuration Dialog
-   
-   SYNOPSIS 
- 
-   ioctl(fd, I2OHTML, struct i2o_html *htquery);
-      struct i2o_html
-      {
-         u32   iop;      /* IOP unit number */
-         u32   tid;      /* Target device ID */
-         u32   page;     /* HTML page */
-         void  *resbuf;  /* Buffer for reply HTML page */
-         u32   *reslen;  /* Length in bytes of reply buffer */
-         void  *qbuf;    /* Pointer to HTTP query string */
-         u32   qlen;     /* Length in bytes of query string buffer */        
-      };
-
-   DESCRIPTION
-
-   This function posts an UtilConfigDialog message to the device identified
-   by htquery->iop and htquery->tid.  The requested HTML page number is 
-   provided by the htquery->page field, and the resultant data is stored 
-   in the buffer pointed to by htquery->resbuf.  If there is an HTTP query 
-   string that is to be sent to the device, it should be sent in the buffer
-   pointed to by htquery->qbuf.  If there is no query string, this field
-   should be set to NULL. The actual size of the reply received is written
-   into *(htquery->reslen).
-  
-   RETURNS
-
-   This function returns 0 if no error occur. If an error occurs, -1
-   is returned and errno is set appropriately:
-
-      EFAULT      Invalid user space pointer was passed
-      ENXIO       Invalid IOP number
-      ENOBUFS     Buffer not large enough.  If this occurs, the required
-                  buffer length is written into *(ops->reslen)
-      ETIMEDOUT   Timeout waiting for reply message
-      ENOMEM      Kernel memory allocation error
-
-XII. Events
-
-    In the process of determining this.  Current idea is to have use
-    the select() interface to allow user apps to periodically poll
-    the /dev/i2o/ctl device for events.  When select() notifies the user
-    that an event is available, the user would call read() to retrieve
-    a list of all the events that are pending for the specific device.
-
-=============================================================================
-Revision History
-=============================================================================
-
-Rev 0.1 - 04/01/99
-- Initial revision
-
-Rev 0.2 - 04/06/99
-- Changed return values to match UNIX ioctl() standard.  Only return values
-  are 0 and -1.  All errors are reported through errno.
-- Added summary of proposed possible event interfaces
-
-Rev 0.3 - 04/20/99
-- Changed all ioctls() to use pointers to user data instead of actual data
-- Updated error values to match the code
diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
deleted file mode 100644
index c463dc2efc09..000000000000
--- a/drivers/message/i2o/bus-osm.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- *	Bus Adapter OSM
- *
- *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	Fixes/additions:
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>
- *			initial version.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-
-#define OSM_NAME	"bus-osm"
-#define OSM_VERSION	"1.317"
-#define OSM_DESCRIPTION	"I2O Bus Adapter OSM"
-
-static struct i2o_driver i2o_bus_driver;
-
-/* Bus OSM class handling definition */
-static struct i2o_class_id i2o_bus_class_id[] = {
-	{I2O_CLASS_BUS_ADAPTER},
-	{I2O_CLASS_END}
-};
-
-/**
- *	i2o_bus_scan - Scan the bus for new devices
- *	@dev: I2O device of the bus, which should be scanned
- *
- *	Scans the bus dev for new / removed devices. After the scan a new LCT
- *	will be fetched automatically.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_bus_scan(struct i2o_device *dev)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return -ETIMEDOUT;
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.
-			tid);
-
-	return i2o_msg_post_wait(dev->iop, msg, 60);
-};
-
-/**
- *	i2o_bus_store_scan - Scan the I2O Bus Adapter
- *	@d: device which should be scanned
- *	@attr: device_attribute
- *	@buf: output buffer
- *	@count: buffer size
- *
- *	Returns count.
- */
-static ssize_t i2o_bus_store_scan(struct device *d,
-				  struct device_attribute *attr,
-				  const char *buf, size_t count)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(d);
-	int rc;
-
-	if ((rc = i2o_bus_scan(i2o_dev)))
-		osm_warn("bus scan failed %d\n", rc);
-
-	return count;
-}
-
-/* Bus Adapter OSM device attributes */
-static DEVICE_ATTR(scan, S_IWUSR, NULL, i2o_bus_store_scan);
-
-/**
- *	i2o_bus_probe - verify if dev is a I2O Bus Adapter device and install it
- *	@dev: device to verify if it is a I2O Bus Adapter device
- *
- *	Because we want all Bus Adapters always return 0.
- *	Except when we fail.  Then we are sad.
- *
- *	Returns 0, except when we fail to excel.
- */
-static int i2o_bus_probe(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(get_device(dev));
-	int rc;
-
-	rc = device_create_file(dev, &dev_attr_scan);
-	if (rc)
-		goto err_out;
-
-	osm_info("device added (TID: %03x)\n", i2o_dev->lct_data.tid);
-
-	return 0;
-
-err_out:
-	put_device(dev);
-	return rc;
-};
-
-/**
- *	i2o_bus_remove - remove the I2O Bus Adapter device from the system again
- *	@dev: I2O Bus Adapter device which should be removed
- *
- *	Always returns 0.
- */
-static int i2o_bus_remove(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	device_remove_file(dev, &dev_attr_scan);
-
-	put_device(dev);
-
-	osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid);
-
-	return 0;
-};
-
-/* Bus Adapter OSM driver struct */
-static struct i2o_driver i2o_bus_driver = {
-	.name = OSM_NAME,
-	.classes = i2o_bus_class_id,
-	.driver = {
-		   .probe = i2o_bus_probe,
-		   .remove = i2o_bus_remove,
-		   },
-};
-
-/**
- *	i2o_bus_init - Bus Adapter OSM initialization function
- *
- *	Only register the Bus Adapter OSM in the I2O core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_bus_init(void)
-{
-	int rc;
-
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
-	/* Register Bus Adapter OSM into I2O core */
-	rc = i2o_driver_register(&i2o_bus_driver);
-	if (rc) {
-		osm_err("Could not register Bus Adapter OSM\n");
-		return rc;
-	}
-
-	return 0;
-};
-
-/**
- *	i2o_bus_exit - Bus Adapter OSM exit function
- *
- *	Unregisters Bus Adapter OSM from I2O core.
- */
-static void __exit i2o_bus_exit(void)
-{
-	i2o_driver_unregister(&i2o_bus_driver);
-};
-
-MODULE_AUTHOR("Markus Lidel <Markus.Lidel@shadowconnect.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-module_init(i2o_bus_init);
-module_exit(i2o_bus_exit);
diff --git a/drivers/message/i2o/config-osm.c b/drivers/message/i2o/config-osm.c
deleted file mode 100644
index 3bba7aa82e58..000000000000
--- a/drivers/message/i2o/config-osm.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *	Configuration OSM
- *
- *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	Fixes/additions:
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>
- *			initial version.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/fs.h>
-
-#include <asm/uaccess.h>
-
-#define OSM_NAME	"config-osm"
-#define OSM_VERSION	"1.323"
-#define OSM_DESCRIPTION	"I2O Configuration OSM"
-
-/* access mode user rw */
-#define S_IWRSR (S_IRUSR | S_IWUSR)
-
-static struct i2o_driver i2o_config_driver;
-
-/* Config OSM driver struct */
-static struct i2o_driver i2o_config_driver = {
-	.name = OSM_NAME,
-};
-
-#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
-#include "i2o_config.c"
-#endif
-
-/**
- *	i2o_config_init - Configuration OSM initialization function
- *
- *	Registers Configuration OSM in the I2O core and if old ioctl's are
- *	compiled in initialize them.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_config_init(void)
-{
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
-	if (i2o_driver_register(&i2o_config_driver)) {
-		osm_err("handler register failed.\n");
-		return -EBUSY;
-	}
-#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
-	if (i2o_config_old_init()) {
-		osm_err("old config handler initialization failed\n");
-		i2o_driver_unregister(&i2o_config_driver);
-		return -EBUSY;
-	}
-#endif
-
-	return 0;
-}
-
-/**
- *	i2o_config_exit - Configuration OSM exit function
- *
- *	If old ioctl's are compiled in exit remove them and unregisters
- *	Configuration OSM from I2O core.
- */
-static void i2o_config_exit(void)
-{
-#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
-	i2o_config_old_exit();
-#endif
-
-	i2o_driver_unregister(&i2o_config_driver);
-}
-
-MODULE_AUTHOR("Markus Lidel <Markus.Lidel@shadowconnect.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-module_init(i2o_config_init);
-module_exit(i2o_config_exit);
diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
deleted file mode 100644
index 91614f11f89a..000000000000
--- a/drivers/message/i2o/core.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *	I2O core internal declarations
- *
- *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	Fixes/additions:
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>
- *			initial version.
- */
-
-/* Exec-OSM */
-extern struct i2o_driver i2o_exec_driver;
-extern int i2o_exec_lct_get(struct i2o_controller *);
-
-extern int __init i2o_exec_init(void);
-extern void i2o_exec_exit(void);
-
-/* driver */
-extern struct bus_type i2o_bus_type;
-
-extern int i2o_driver_dispatch(struct i2o_controller *, u32);
-
-extern int __init i2o_driver_init(void);
-extern void i2o_driver_exit(void);
-
-/* PCI */
-extern int __init i2o_pci_init(void);
-extern void __exit i2o_pci_exit(void);
-
-/* device */
-extern const struct attribute_group *i2o_device_groups[];
-
-extern void i2o_device_remove(struct i2o_device *);
-extern int i2o_device_parse_lct(struct i2o_controller *);
-
-int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
-		   int oplen, void *reslist, int reslen);
-
-/* IOP */
-extern struct i2o_controller *i2o_iop_alloc(void);
-
-/**
- *	i2o_iop_free - Free the i2o_controller struct
- *	@c: I2O controller to free
- */
-static inline void i2o_iop_free(struct i2o_controller *c)
-{
-	i2o_pool_free(&c->in_msg);
-	kfree(c);
-}
-
-extern int i2o_iop_add(struct i2o_controller *);
-extern void i2o_iop_remove(struct i2o_controller *);
-
-/* control registers relative to c->base */
-#define I2O_IRQ_STATUS	0x30
-#define I2O_IRQ_MASK	0x34
-#define I2O_IN_PORT	0x40
-#define I2O_OUT_PORT	0x44
-
-/* Motorola/Freescale specific register offset */
-#define I2O_MOTOROLA_PORT_OFFSET	0x10400
-
-#define I2O_IRQ_OUTBOUND_POST	0x00000008
diff --git a/drivers/message/i2o/debug.c b/drivers/message/i2o/debug.c
deleted file mode 100644
index ce62d8bfe1c8..000000000000
--- a/drivers/message/i2o/debug.c
+++ /dev/null
@@ -1,472 +0,0 @@
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/i2o.h>
-
-static void i2o_report_util_cmd(u8 cmd);
-static void i2o_report_exec_cmd(u8 cmd);
-static void i2o_report_fail_status(u8 req_status, u32 * msg);
-static void i2o_report_common_status(u8 req_status);
-static void i2o_report_common_dsc(u16 detailed_status);
-
-/*
- * Used for error reporting/debugging purposes.
- * Report Cmd name, Request status, Detailed Status.
- */
-void i2o_report_status(const char *severity, const char *str,
-		       struct i2o_message *m)
-{
-	u32 *msg = (u32 *) m;
-	u8 cmd = (msg[1] >> 24) & 0xFF;
-	u8 req_status = (msg[4] >> 24) & 0xFF;
-	u16 detailed_status = msg[4] & 0xFFFF;
-
-	if (cmd == I2O_CMD_UTIL_EVT_REGISTER)
-		return;		// No status in this reply
-
-	printk("%s%s: ", severity, str);
-
-	if (cmd < 0x1F)		// Utility cmd
-		i2o_report_util_cmd(cmd);
-
-	else if (cmd >= 0xA0 && cmd <= 0xEF)	// Executive cmd
-		i2o_report_exec_cmd(cmd);
-	else
-		printk("Cmd = %0#2x, ", cmd);	// Other cmds
-
-	if (msg[0] & MSG_FAIL) {
-		i2o_report_fail_status(req_status, msg);
-		return;
-	}
-
-	i2o_report_common_status(req_status);
-
-	if (cmd < 0x1F || (cmd >= 0xA0 && cmd <= 0xEF))
-		i2o_report_common_dsc(detailed_status);
-	else
-		printk(" / DetailedStatus = %0#4x.\n",
-		       detailed_status);
-}
-
-/* Used to dump a message to syslog during debugging */
-void i2o_dump_message(struct i2o_message *m)
-{
-#ifdef DEBUG
-	u32 *msg = (u32 *) m;
-	int i;
-	printk(KERN_INFO "Dumping I2O message size %d @ %p\n",
-	       msg[0] >> 16 & 0xffff, msg);
-	for (i = 0; i < ((msg[0] >> 16) & 0xffff); i++)
-		printk(KERN_INFO "  msg[%d] = %0#10x\n", i, msg[i]);
-#endif
-}
-
-/*
- * Used for error reporting/debugging purposes.
- * Following fail status are common to all classes.
- * The preserved message must be handled in the reply handler.
- */
-static void i2o_report_fail_status(u8 req_status, u32 * msg)
-{
-	static char *FAIL_STATUS[] = {
-		"0x80",		/* not used */
-		"SERVICE_SUSPENDED",	/* 0x81 */
-		"SERVICE_TERMINATED",	/* 0x82 */
-		"CONGESTION",
-		"FAILURE",
-		"STATE_ERROR",
-		"TIME_OUT",
-		"ROUTING_FAILURE",
-		"INVALID_VERSION",
-		"INVALID_OFFSET",
-		"INVALID_MSG_FLAGS",
-		"FRAME_TOO_SMALL",
-		"FRAME_TOO_LARGE",
-		"INVALID_TARGET_ID",
-		"INVALID_INITIATOR_ID",
-		"INVALID_INITIATOR_CONTEX",	/* 0x8F */
-		"UNKNOWN_FAILURE"	/* 0xFF */
-	};
-
-	if (req_status == I2O_FSC_TRANSPORT_UNKNOWN_FAILURE)
-		printk("TRANSPORT_UNKNOWN_FAILURE (%0#2x).\n",
-		       req_status);
-	else
-		printk("TRANSPORT_%s.\n",
-		       FAIL_STATUS[req_status & 0x0F]);
-
-	/* Dump some details */
-
-	printk(KERN_ERR "  InitiatorId = %d, TargetId = %d\n",
-	       (msg[1] >> 12) & 0xFFF, msg[1] & 0xFFF);
-	printk(KERN_ERR "  LowestVersion = 0x%02X, HighestVersion = 0x%02X\n",
-	       (msg[4] >> 8) & 0xFF, msg[4] & 0xFF);
-	printk(KERN_ERR "  FailingHostUnit = 0x%04X,  FailingIOP = 0x%03X\n",
-	       msg[5] >> 16, msg[5] & 0xFFF);
-
-	printk(KERN_ERR "  Severity:  0x%02X\n", (msg[4] >> 16) & 0xFF);
-	if (msg[4] & (1 << 16))
-		printk(KERN_DEBUG "(FormatError), "
-		       "this msg can never be delivered/processed.\n");
-	if (msg[4] & (1 << 17))
-		printk(KERN_DEBUG "(PathError), "
-		       "this msg can no longer be delivered/processed.\n");
-	if (msg[4] & (1 << 18))
-		printk(KERN_DEBUG "(PathState), "
-		       "the system state does not allow delivery.\n");
-	if (msg[4] & (1 << 19))
-		printk(KERN_DEBUG
-		       "(Congestion), resources temporarily not available;"
-		       "do not retry immediately.\n");
-}
-
-/*
- * Used for error reporting/debugging purposes.
- * Following reply status are common to all classes.
- */
-static void i2o_report_common_status(u8 req_status)
-{
-	static char *REPLY_STATUS[] = {
-		"SUCCESS",
-		"ABORT_DIRTY",
-		"ABORT_NO_DATA_TRANSFER",
-		"ABORT_PARTIAL_TRANSFER",
-		"ERROR_DIRTY",
-		"ERROR_NO_DATA_TRANSFER",
-		"ERROR_PARTIAL_TRANSFER",
-		"PROCESS_ABORT_DIRTY",
-		"PROCESS_ABORT_NO_DATA_TRANSFER",
-		"PROCESS_ABORT_PARTIAL_TRANSFER",
-		"TRANSACTION_ERROR",
-		"PROGRESS_REPORT"
-	};
-
-	if (req_status >= ARRAY_SIZE(REPLY_STATUS))
-		printk("RequestStatus = %0#2x", req_status);
-	else
-		printk("%s", REPLY_STATUS[req_status]);
-}
-
-/*
- * Used for error reporting/debugging purposes.
- * Following detailed status are valid  for executive class,
- * utility class, DDM class and for transaction error replies.
- */
-static void i2o_report_common_dsc(u16 detailed_status)
-{
-	static char *COMMON_DSC[] = {
-		"SUCCESS",
-		"0x01",		// not used
-		"BAD_KEY",
-		"TCL_ERROR",
-		"REPLY_BUFFER_FULL",
-		"NO_SUCH_PAGE",
-		"INSUFFICIENT_RESOURCE_SOFT",
-		"INSUFFICIENT_RESOURCE_HARD",
-		"0x08",		// not used
-		"CHAIN_BUFFER_TOO_LARGE",
-		"UNSUPPORTED_FUNCTION",
-		"DEVICE_LOCKED",
-		"DEVICE_RESET",
-		"INAPPROPRIATE_FUNCTION",
-		"INVALID_INITIATOR_ADDRESS",
-		"INVALID_MESSAGE_FLAGS",
-		"INVALID_OFFSET",
-		"INVALID_PARAMETER",
-		"INVALID_REQUEST",
-		"INVALID_TARGET_ADDRESS",
-		"MESSAGE_TOO_LARGE",
-		"MESSAGE_TOO_SMALL",
-		"MISSING_PARAMETER",
-		"TIMEOUT",
-		"UNKNOWN_ERROR",
-		"UNKNOWN_FUNCTION",
-		"UNSUPPORTED_VERSION",
-		"DEVICE_BUSY",
-		"DEVICE_NOT_AVAILABLE"
-	};
-
-	if (detailed_status > I2O_DSC_DEVICE_NOT_AVAILABLE)
-		printk(" / DetailedStatus = %0#4x.\n",
-		       detailed_status);
-	else
-		printk(" / %s.\n", COMMON_DSC[detailed_status]);
-}
-
-/*
- * Used for error reporting/debugging purposes
- */
-static void i2o_report_util_cmd(u8 cmd)
-{
-	switch (cmd) {
-	case I2O_CMD_UTIL_NOP:
-		printk("UTIL_NOP, ");
-		break;
-	case I2O_CMD_UTIL_ABORT:
-		printk("UTIL_ABORT, ");
-		break;
-	case I2O_CMD_UTIL_CLAIM:
-		printk("UTIL_CLAIM, ");
-		break;
-	case I2O_CMD_UTIL_RELEASE:
-		printk("UTIL_CLAIM_RELEASE, ");
-		break;
-	case I2O_CMD_UTIL_CONFIG_DIALOG:
-		printk("UTIL_CONFIG_DIALOG, ");
-		break;
-	case I2O_CMD_UTIL_DEVICE_RESERVE:
-		printk("UTIL_DEVICE_RESERVE, ");
-		break;
-	case I2O_CMD_UTIL_DEVICE_RELEASE:
-		printk("UTIL_DEVICE_RELEASE, ");
-		break;
-	case I2O_CMD_UTIL_EVT_ACK:
-		printk("UTIL_EVENT_ACKNOWLEDGE, ");
-		break;
-	case I2O_CMD_UTIL_EVT_REGISTER:
-		printk("UTIL_EVENT_REGISTER, ");
-		break;
-	case I2O_CMD_UTIL_LOCK:
-		printk("UTIL_LOCK, ");
-		break;
-	case I2O_CMD_UTIL_LOCK_RELEASE:
-		printk("UTIL_LOCK_RELEASE, ");
-		break;
-	case I2O_CMD_UTIL_PARAMS_GET:
-		printk("UTIL_PARAMS_GET, ");
-		break;
-	case I2O_CMD_UTIL_PARAMS_SET:
-		printk("UTIL_PARAMS_SET, ");
-		break;
-	case I2O_CMD_UTIL_REPLY_FAULT_NOTIFY:
-		printk("UTIL_REPLY_FAULT_NOTIFY, ");
-		break;
-	default:
-		printk("Cmd = %0#2x, ", cmd);
-	}
-}
-
-/*
- * Used for error reporting/debugging purposes
- */
-static void i2o_report_exec_cmd(u8 cmd)
-{
-	switch (cmd) {
-	case I2O_CMD_ADAPTER_ASSIGN:
-		printk("EXEC_ADAPTER_ASSIGN, ");
-		break;
-	case I2O_CMD_ADAPTER_READ:
-		printk("EXEC_ADAPTER_READ, ");
-		break;
-	case I2O_CMD_ADAPTER_RELEASE:
-		printk("EXEC_ADAPTER_RELEASE, ");
-		break;
-	case I2O_CMD_BIOS_INFO_SET:
-		printk("EXEC_BIOS_INFO_SET, ");
-		break;
-	case I2O_CMD_BOOT_DEVICE_SET:
-		printk("EXEC_BOOT_DEVICE_SET, ");
-		break;
-	case I2O_CMD_CONFIG_VALIDATE:
-		printk("EXEC_CONFIG_VALIDATE, ");
-		break;
-	case I2O_CMD_CONN_SETUP:
-		printk("EXEC_CONN_SETUP, ");
-		break;
-	case I2O_CMD_DDM_DESTROY:
-		printk("EXEC_DDM_DESTROY, ");
-		break;
-	case I2O_CMD_DDM_ENABLE:
-		printk("EXEC_DDM_ENABLE, ");
-		break;
-	case I2O_CMD_DDM_QUIESCE:
-		printk("EXEC_DDM_QUIESCE, ");
-		break;
-	case I2O_CMD_DDM_RESET:
-		printk("EXEC_DDM_RESET, ");
-		break;
-	case I2O_CMD_DDM_SUSPEND:
-		printk("EXEC_DDM_SUSPEND, ");
-		break;
-	case I2O_CMD_DEVICE_ASSIGN:
-		printk("EXEC_DEVICE_ASSIGN, ");
-		break;
-	case I2O_CMD_DEVICE_RELEASE:
-		printk("EXEC_DEVICE_RELEASE, ");
-		break;
-	case I2O_CMD_HRT_GET:
-		printk("EXEC_HRT_GET, ");
-		break;
-	case I2O_CMD_ADAPTER_CLEAR:
-		printk("EXEC_IOP_CLEAR, ");
-		break;
-	case I2O_CMD_ADAPTER_CONNECT:
-		printk("EXEC_IOP_CONNECT, ");
-		break;
-	case I2O_CMD_ADAPTER_RESET:
-		printk("EXEC_IOP_RESET, ");
-		break;
-	case I2O_CMD_LCT_NOTIFY:
-		printk("EXEC_LCT_NOTIFY, ");
-		break;
-	case I2O_CMD_OUTBOUND_INIT:
-		printk("EXEC_OUTBOUND_INIT, ");
-		break;
-	case I2O_CMD_PATH_ENABLE:
-		printk("EXEC_PATH_ENABLE, ");
-		break;
-	case I2O_CMD_PATH_QUIESCE:
-		printk("EXEC_PATH_QUIESCE, ");
-		break;
-	case I2O_CMD_PATH_RESET:
-		printk("EXEC_PATH_RESET, ");
-		break;
-	case I2O_CMD_STATIC_MF_CREATE:
-		printk("EXEC_STATIC_MF_CREATE, ");
-		break;
-	case I2O_CMD_STATIC_MF_RELEASE:
-		printk("EXEC_STATIC_MF_RELEASE, ");
-		break;
-	case I2O_CMD_STATUS_GET:
-		printk("EXEC_STATUS_GET, ");
-		break;
-	case I2O_CMD_SW_DOWNLOAD:
-		printk("EXEC_SW_DOWNLOAD, ");
-		break;
-	case I2O_CMD_SW_UPLOAD:
-		printk("EXEC_SW_UPLOAD, ");
-		break;
-	case I2O_CMD_SW_REMOVE:
-		printk("EXEC_SW_REMOVE, ");
-		break;
-	case I2O_CMD_SYS_ENABLE:
-		printk("EXEC_SYS_ENABLE, ");
-		break;
-	case I2O_CMD_SYS_MODIFY:
-		printk("EXEC_SYS_MODIFY, ");
-		break;
-	case I2O_CMD_SYS_QUIESCE:
-		printk("EXEC_SYS_QUIESCE, ");
-		break;
-	case I2O_CMD_SYS_TAB_SET:
-		printk("EXEC_SYS_TAB_SET, ");
-		break;
-	default:
-		printk("Cmd = %#02x, ", cmd);
-	}
-}
-
-void i2o_debug_state(struct i2o_controller *c)
-{
-	printk(KERN_INFO "%s: State = ", c->name);
-	switch (((i2o_status_block *) c->status_block.virt)->iop_state) {
-	case 0x01:
-		printk("INIT\n");
-		break;
-	case 0x02:
-		printk("RESET\n");
-		break;
-	case 0x04:
-		printk("HOLD\n");
-		break;
-	case 0x05:
-		printk("READY\n");
-		break;
-	case 0x08:
-		printk("OPERATIONAL\n");
-		break;
-	case 0x10:
-		printk("FAILED\n");
-		break;
-	case 0x11:
-		printk("FAULTED\n");
-		break;
-	default:
-		printk("%x (unknown !!)\n",
-		       ((i2o_status_block *) c->status_block.virt)->iop_state);
-	}
-};
-
-void i2o_dump_hrt(struct i2o_controller *c)
-{
-	u32 *rows = (u32 *) c->hrt.virt;
-	u8 *p = (u8 *) c->hrt.virt;
-	u8 *d;
-	int count;
-	int length;
-	int i;
-	int state;
-
-	if (p[3] != 0) {
-		printk(KERN_ERR
-		       "%s: HRT table for controller is too new a version.\n",
-		       c->name);
-		return;
-	}
-
-	count = p[0] | (p[1] << 8);
-	length = p[2];
-
-	printk(KERN_INFO "%s: HRT has %d entries of %d bytes each.\n",
-	       c->name, count, length << 2);
-
-	rows += 2;
-
-	for (i = 0; i < count; i++) {
-		printk(KERN_INFO "Adapter %08X: ", rows[0]);
-		p = (u8 *) (rows + 1);
-		d = (u8 *) (rows + 2);
-		state = p[1] << 8 | p[0];
-
-		printk("TID %04X:[", state & 0xFFF);
-		state >>= 12;
-		if (state & (1 << 0))
-			printk("H");	/* Hidden */
-		if (state & (1 << 2)) {
-			printk("P");	/* Present */
-			if (state & (1 << 1))
-				printk("C");	/* Controlled */
-		}
-		if (state > 9)
-			printk("*");	/* Hard */
-
-		printk("]:");
-
-		switch (p[3] & 0xFFFF) {
-		case 0:
-			/* Adapter private bus - easy */
-			printk("Local bus %d: I/O at 0x%04X Mem 0x%08X", p[2],
-			       d[1] << 8 | d[0], *(u32 *) (d + 4));
-			break;
-		case 1:
-			/* ISA bus */
-			printk("ISA %d: CSN %d I/O at 0x%04X Mem 0x%08X", p[2],
-			       d[2], d[1] << 8 | d[0], *(u32 *) (d + 4));
-			break;
-
-		case 2:	/* EISA bus */
-			printk("EISA %d: Slot %d I/O at 0x%04X Mem 0x%08X",
-			       p[2], d[3], d[1] << 8 | d[0], *(u32 *) (d + 4));
-			break;
-
-		case 3:	/* MCA bus */
-			printk("MCA %d: Slot %d I/O at 0x%04X Mem 0x%08X", p[2],
-			       d[3], d[1] << 8 | d[0], *(u32 *) (d + 4));
-			break;
-
-		case 4:	/* PCI bus */
-			printk("PCI %d: Bus %d Device %d Function %d", p[2],
-			       d[2], d[1], d[0]);
-			break;
-
-		case 0x80:	/* Other */
-		default:
-			printk("Unsupported bus type.");
-			break;
-		}
-		printk("\n");
-		rows += length;
-	}
-}
-
-EXPORT_SYMBOL(i2o_dump_message);
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
deleted file mode 100644
index 98348f420b52..000000000000
--- a/drivers/message/i2o/device.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- *	Functions to handle I2O devices
- *
- *	Copyright (C) 2004	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	Fixes/additions:
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>
- *			initial version.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-#include <linux/delay.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include "core.h"
-
-/**
- *	i2o_device_issue_claim - claim or release a device
- *	@dev: I2O device to claim or release
- *	@cmd: claim or release command
- *	@type: type of claim
- *
- *	Issue I2O UTIL_CLAIM or UTIL_RELEASE messages. The message to be sent
- *	is set by cmd. dev is the I2O device which should be claim or
- *	released and the type is the claim type (see the I2O spec).
- *
- *	Returs 0 on success or negative error code on failure.
- */
-static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
-					 u32 type)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
-	msg->body[0] = cpu_to_le32(type);
-
-	return i2o_msg_post_wait(dev->iop, msg, 60);
-}
-
-/**
- *	i2o_device_claim - claim a device for use by an OSM
- *	@dev: I2O device to claim
- *
- *	Do the leg work to assign a device to a given OSM. If the claim succeeds,
- *	the owner is the primary. If the attempt fails a negative errno code
- *	is returned. On success zero is returned.
- */
-int i2o_device_claim(struct i2o_device *dev)
-{
-	int rc = 0;
-
-	mutex_lock(&dev->lock);
-
-	rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_CLAIM, I2O_CLAIM_PRIMARY);
-	if (!rc)
-		pr_debug("i2o: claim of device %d succeeded\n",
-			 dev->lct_data.tid);
-	else
-		pr_debug("i2o: claim of device %d failed %d\n",
-			 dev->lct_data.tid, rc);
-
-	mutex_unlock(&dev->lock);
-
-	return rc;
-}
-
-/**
- *	i2o_device_claim_release - release a device that the OSM is using
- *	@dev: device to release
- *
- *	Drop a claim by an OSM on a given I2O device.
- *
- *	AC - some devices seem to want to refuse an unclaim until they have
- *	finished internal processing. It makes sense since you don't want a
- *	new device to go reconfiguring the entire system until you are done.
- *	Thus we are prepared to wait briefly.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_device_claim_release(struct i2o_device *dev)
-{
-	int tries;
-	int rc = 0;
-
-	mutex_lock(&dev->lock);
-
-	/*
-	 *      If the controller takes a nonblocking approach to
-	 *      releases we have to sleep/poll for a few times.
-	 */
-	for (tries = 0; tries < 10; tries++) {
-		rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_RELEASE,
-					    I2O_CLAIM_PRIMARY);
-		if (!rc)
-			break;
-
-		ssleep(1);
-	}
-
-	if (!rc)
-		pr_debug("i2o: claim release of device %d succeeded\n",
-			 dev->lct_data.tid);
-	else
-		pr_debug("i2o: claim release of device %d failed %d\n",
-			 dev->lct_data.tid, rc);
-
-	mutex_unlock(&dev->lock);
-
-	return rc;
-}
-
-/**
- *	i2o_device_release - release the memory for a I2O device
- *	@dev: I2O device which should be released
- *
- *	Release the allocated memory. This function is called if refcount of
- *	device reaches 0 automatically.
- */
-static void i2o_device_release(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	pr_debug("i2o: device %s released\n", dev_name(dev));
-
-	kfree(i2o_dev);
-}
-
-/**
- *	class_id_show - Displays class id of I2O device
- *	@dev: device of which the class id should be displayed
- *	@attr: pointer to device attribute
- *	@buf: buffer into which the class id should be printed
- *
- *	Returns the number of bytes which are printed into the buffer.
- */
-static ssize_t class_id_show(struct device *dev, struct device_attribute *attr,
-			     char *buf)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	sprintf(buf, "0x%03x\n", i2o_dev->lct_data.class_id);
-	return strlen(buf) + 1;
-}
-static DEVICE_ATTR_RO(class_id);
-
-/**
- *	tid_show - Displays TID of I2O device
- *	@dev: device of which the TID should be displayed
- *	@attr: pointer to device attribute
- *	@buf: buffer into which the TID should be printed
- *
- *	Returns the number of bytes which are printed into the buffer.
- */
-static ssize_t tid_show(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	sprintf(buf, "0x%03x\n", i2o_dev->lct_data.tid);
-	return strlen(buf) + 1;
-}
-static DEVICE_ATTR_RO(tid);
-
-/* I2O device attributes */
-static struct attribute *i2o_device_attrs[] = {
-	&dev_attr_class_id.attr,
-	&dev_attr_tid.attr,
-	NULL,
-};
-
-static const struct attribute_group i2o_device_group = {
-	.attrs = i2o_device_attrs,
-};
-
-const struct attribute_group *i2o_device_groups[] = {
-	&i2o_device_group,
-	NULL,
-};
-
-/**
- *	i2o_device_alloc - Allocate a I2O device and initialize it
- *
- *	Allocate the memory for a I2O device and initialize locks and lists
- *
- *	Returns the allocated I2O device or a negative error code if the device
- *	could not be allocated.
- */
-static struct i2o_device *i2o_device_alloc(void)
-{
-	struct i2o_device *dev;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&dev->list);
-	mutex_init(&dev->lock);
-
-	dev->device.bus = &i2o_bus_type;
-	dev->device.release = &i2o_device_release;
-
-	return dev;
-}
-
-/**
- *	i2o_device_add - allocate a new I2O device and add it to the IOP
- *	@c: I2O controller that the device is on
- *	@entry: LCT entry of the I2O device
- *
- *	Allocate a new I2O device and initialize it with the LCT entry. The
- *	device is appended to the device list of the controller.
- *
- *	Returns zero on success, or a -ve errno.
- */
-static int i2o_device_add(struct i2o_controller *c, i2o_lct_entry *entry)
-{
-	struct i2o_device *i2o_dev, *tmp;
-	int rc;
-
-	i2o_dev = i2o_device_alloc();
-	if (IS_ERR(i2o_dev)) {
-		printk(KERN_ERR "i2o: unable to allocate i2o device\n");
-		return PTR_ERR(i2o_dev);
-	}
-
-	i2o_dev->lct_data = *entry;
-
-	dev_set_name(&i2o_dev->device, "%d:%03x", c->unit,
-		     i2o_dev->lct_data.tid);
-
-	i2o_dev->iop = c;
-	i2o_dev->device.parent = &c->device;
-
-	rc = device_register(&i2o_dev->device);
-	if (rc)
-		goto err;
-
-	list_add_tail(&i2o_dev->list, &c->devices);
-
-	/* create user entries for this device */
-	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
-	if (tmp && (tmp != i2o_dev)) {
-		rc = sysfs_create_link(&i2o_dev->device.kobj,
-				       &tmp->device.kobj, "user");
-		if (rc)
-			goto unreg_dev;
-	}
-
-	/* create user entries referring to this device */
-	list_for_each_entry(tmp, &c->devices, list)
-	    if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
-		&& (tmp != i2o_dev)) {
-		rc = sysfs_create_link(&tmp->device.kobj,
-				       &i2o_dev->device.kobj, "user");
-		if (rc)
-			goto rmlink1;
-	}
-
-	/* create parent entries for this device */
-	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
-	if (tmp && (tmp != i2o_dev)) {
-		rc = sysfs_create_link(&i2o_dev->device.kobj,
-				       &tmp->device.kobj, "parent");
-		if (rc)
-			goto rmlink1;
-	}
-
-	/* create parent entries referring to this device */
-	list_for_each_entry(tmp, &c->devices, list)
-	    if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
-		&& (tmp != i2o_dev)) {
-		rc = sysfs_create_link(&tmp->device.kobj,
-				       &i2o_dev->device.kobj, "parent");
-		if (rc)
-			goto rmlink2;
-	}
-
-	i2o_driver_notify_device_add_all(i2o_dev);
-
-	pr_debug("i2o: device %s added\n", dev_name(&i2o_dev->device));
-
-	return 0;
-
-rmlink2:
-	/* If link creating failed halfway, we loop whole list to cleanup.
-	 * And we don't care wrong removing of link, because sysfs_remove_link
-	 * will take care of it.
-	 */
-	list_for_each_entry(tmp, &c->devices, list) {
-		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "parent");
-	}
-	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
-rmlink1:
-	list_for_each_entry(tmp, &c->devices, list)
-		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "user");
-	sysfs_remove_link(&i2o_dev->device.kobj, "user");
-unreg_dev:
-	list_del(&i2o_dev->list);
-	device_unregister(&i2o_dev->device);
-err:
-	kfree(i2o_dev);
-	return rc;
-}
-
-/**
- *	i2o_device_remove - remove an I2O device from the I2O core
- *	@i2o_dev: I2O device which should be released
- *
- *	Is used on I2O controller removal or LCT modification, when the device
- *	is removed from the system. Note that the device could still hang
- *	around until the refcount reaches 0.
- */
-void i2o_device_remove(struct i2o_device *i2o_dev)
-{
-	struct i2o_device *tmp;
-	struct i2o_controller *c = i2o_dev->iop;
-
-	i2o_driver_notify_device_remove_all(i2o_dev);
-
-	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
-	sysfs_remove_link(&i2o_dev->device.kobj, "user");
-
-	list_for_each_entry(tmp, &c->devices, list) {
-		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "parent");
-		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "user");
-	}
-	list_del(&i2o_dev->list);
-
-	device_unregister(&i2o_dev->device);
-}
-
-/**
- *	i2o_device_parse_lct - Parse a previously fetched LCT and create devices
- *	@c: I2O controller from which the LCT should be parsed.
- *
- *	The Logical Configuration Table tells us what we can talk to on the
- *	board. For every entry we create an I2O device, which is registered in
- *	the I2O core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_device_parse_lct(struct i2o_controller *c)
-{
-	struct i2o_device *dev, *tmp;
-	i2o_lct *lct;
-	u32 *dlct = c->dlct.virt;
-	int max = 0, i = 0;
-	u16 table_size;
-	u32 buf;
-
-	mutex_lock(&c->lct_lock);
-
-	kfree(c->lct);
-
-	buf = le32_to_cpu(*dlct++);
-	table_size = buf & 0xffff;
-
-	lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL);
-	if (!lct) {
-		mutex_unlock(&c->lct_lock);
-		return -ENOMEM;
-	}
-
-	lct->lct_ver = buf >> 28;
-	lct->boot_tid = buf >> 16 & 0xfff;
-	lct->table_size = table_size;
-	lct->change_ind = le32_to_cpu(*dlct++);
-	lct->iop_flags = le32_to_cpu(*dlct++);
-
-	table_size -= 3;
-
-	pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, max,
-		 lct->table_size);
-
-	while (table_size > 0) {
-		i2o_lct_entry *entry = &lct->lct_entry[max];
-		int found = 0;
-
-		buf = le32_to_cpu(*dlct++);
-		entry->entry_size = buf & 0xffff;
-		entry->tid = buf >> 16 & 0xfff;
-
-		entry->change_ind = le32_to_cpu(*dlct++);
-		entry->device_flags = le32_to_cpu(*dlct++);
-
-		buf = le32_to_cpu(*dlct++);
-		entry->class_id = buf & 0xfff;
-		entry->version = buf >> 12 & 0xf;
-		entry->vendor_id = buf >> 16;
-
-		entry->sub_class = le32_to_cpu(*dlct++);
-
-		buf = le32_to_cpu(*dlct++);
-		entry->user_tid = buf & 0xfff;
-		entry->parent_tid = buf >> 12 & 0xfff;
-		entry->bios_info = buf >> 24;
-
-		memcpy(&entry->identity_tag, dlct, 8);
-		dlct += 2;
-
-		entry->event_capabilities = le32_to_cpu(*dlct++);
-
-		/* add new devices, which are new in the LCT */
-		list_for_each_entry_safe(dev, tmp, &c->devices, list) {
-			if (entry->tid == dev->lct_data.tid) {
-				found = 1;
-				break;
-			}
-		}
-
-		if (!found)
-			i2o_device_add(c, entry);
-
-		table_size -= 9;
-		max++;
-	}
-
-	/* remove devices, which are not in the LCT anymore */
-	list_for_each_entry_safe(dev, tmp, &c->devices, list) {
-		int found = 0;
-
-		for (i = 0; i < max; i++) {
-			if (lct->lct_entry[i].tid == dev->lct_data.tid) {
-				found = 1;
-				break;
-			}
-		}
-
-		if (!found)
-			i2o_device_remove(dev);
-	}
-
-	mutex_unlock(&c->lct_lock);
-
-	return 0;
-}
-
-/*
- *	Run time support routines
- */
-
-/*	Issue UTIL_PARAMS_GET or UTIL_PARAMS_SET
- *
- *	This function can be used for all UtilParamsGet/Set operations.
- *	The OperationList is given in oplist-buffer,
- *	and results are returned in reslist-buffer.
- *	Note that the minimum sized reslist is 8 bytes and contains
- *	ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
- */
-int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
-		   int oplen, void *reslist, int reslen)
-{
-	struct i2o_message *msg;
-	int i = 0;
-	int rc;
-	struct i2o_dma res;
-	struct i2o_controller *c = i2o_dev->iop;
-	struct device *dev = &c->pdev->dev;
-
-	res.virt = NULL;
-
-	if (i2o_dma_alloc(dev, &res, reslen))
-		return -ENOMEM;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg)) {
-		i2o_dma_free(dev, &res);
-		return PTR_ERR(msg);
-	}
-
-	i = 0;
-	msg->u.head[1] =
-	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid);
-	msg->body[i++] = cpu_to_le32(0x00000000);
-	msg->body[i++] = cpu_to_le32(0x4C000000 | oplen);	/* OperationList */
-	memcpy(&msg->body[i], oplist, oplen);
-	i += (oplen / 4 + (oplen % 4 ? 1 : 0));
-	msg->body[i++] = cpu_to_le32(0xD0000000 | res.len);	/* ResultList */
-	msg->body[i++] = cpu_to_le32(res.phys);
-
-	msg->u.head[0] =
-	    cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
-			SGL_OFFSET_5);
-
-	rc = i2o_msg_post_wait_mem(c, msg, 10, &res);
-
-	/* This only looks like a memory leak - don't "fix" it. */
-	if (rc == -ETIMEDOUT)
-		return rc;
-
-	memcpy(reslist, res.virt, res.len);
-	i2o_dma_free(dev, &res);
-
-	return rc;
-}
-
-/*
- *	 Query one field group value or a whole scalar group.
- */
-int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
-		       void *buf, int buflen)
-{
-	u32 opblk[] = { cpu_to_le32(0x00000001),
-		cpu_to_le32((u16) group << 16 | I2O_PARAMS_FIELD_GET),
-		cpu_to_le32((s16) field << 16 | 0x00000001)
-	};
-	u8 *resblk;		/* 8 bytes for header */
-	int rc;
-
-	resblk = kmalloc(buflen + 8, GFP_KERNEL);
-	if (!resblk)
-		return -ENOMEM;
-
-	rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
-			    sizeof(opblk), resblk, buflen + 8);
-
-	memcpy(buf, resblk + 8, buflen);	/* cut off header */
-
-	kfree(resblk);
-
-	return rc;
-}
-
-/*
- *	if oper == I2O_PARAMS_TABLE_GET, get from all rows
- *		if fieldcount == -1 return all fields
- *			ibuf and ibuflen are unused (use NULL, 0)
- *		else return specific fields
- *			ibuf contains fieldindexes
- *
- *	if oper == I2O_PARAMS_LIST_GET, get from specific rows
- *		if fieldcount == -1 return all fields
- *			ibuf contains rowcount, keyvalues
- *		else return specific fields
- *			fieldcount is # of fieldindexes
- *			ibuf contains fieldindexes, rowcount, keyvalues
- *
- *	You could also use directly function i2o_issue_params().
- */
-int i2o_parm_table_get(struct i2o_device *dev, int oper, int group,
-		       int fieldcount, void *ibuf, int ibuflen, void *resblk,
-		       int reslen)
-{
-	u16 *opblk;
-	int size;
-
-	size = 10 + ibuflen;
-	if (size % 4)
-		size += 4 - size % 4;
-
-	opblk = kmalloc(size, GFP_KERNEL);
-	if (opblk == NULL) {
-		printk(KERN_ERR "i2o: no memory for query buffer.\n");
-		return -ENOMEM;
-	}
-
-	opblk[0] = 1;		/* operation count */
-	opblk[1] = 0;		/* pad */
-	opblk[2] = oper;
-	opblk[3] = group;
-	opblk[4] = fieldcount;
-	memcpy(opblk + 5, ibuf, ibuflen);	/* other params */
-
-	size = i2o_parm_issue(dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
-			      size, resblk, reslen);
-
-	kfree(opblk);
-	if (size > reslen)
-		return reslen;
-
-	return size;
-}
-
-EXPORT_SYMBOL(i2o_device_claim);
-EXPORT_SYMBOL(i2o_device_claim_release);
-EXPORT_SYMBOL(i2o_parm_field_get);
-EXPORT_SYMBOL(i2o_parm_table_get);
-EXPORT_SYMBOL(i2o_parm_issue);
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
deleted file mode 100644
index 1b18a0d1d05b..000000000000
--- a/drivers/message/i2o/driver.c
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- *	Functions to handle I2O drivers (OSMs) and I2O bus type for sysfs
- *
- *	Copyright (C) 2004	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	Fixes/additions:
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>
- *			initial version.
- */
-
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/rwsem.h>
-#include <linux/i2o.h>
-#include <linux/workqueue.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include "core.h"
-
-#define OSM_NAME	"i2o"
-
-/* max_drivers - Maximum I2O drivers (OSMs) which could be registered */
-static unsigned int i2o_max_drivers = I2O_MAX_DRIVERS;
-module_param_named(max_drivers, i2o_max_drivers, uint, 0);
-MODULE_PARM_DESC(max_drivers, "maximum number of OSM's to support");
-
-/* I2O drivers lock and array */
-static spinlock_t i2o_drivers_lock;
-static struct i2o_driver **i2o_drivers;
-
-/**
- *	i2o_bus_match - Tell if I2O device class id matches the class ids of the I2O driver (OSM)
- *	@dev: device which should be verified
- *	@drv: the driver to match against
- *
- *	Used by the bus to check if the driver wants to handle the device.
- *
- *	Returns 1 if the class ids of the driver match the class id of the
- *	device, otherwise 0.
- */
-static int i2o_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_driver *i2o_drv = to_i2o_driver(drv);
-	struct i2o_class_id *ids = i2o_drv->classes;
-
-	if (ids)
-		while (ids->class_id != I2O_CLASS_END) {
-			if (ids->class_id == i2o_dev->lct_data.class_id)
-				return 1;
-			ids++;
-		}
-	return 0;
-};
-
-/* I2O bus type */
-struct bus_type i2o_bus_type = {
-	.name = "i2o",
-	.match = i2o_bus_match,
-	.dev_groups = i2o_device_groups,
-};
-
-/**
- *	i2o_driver_register - Register a I2O driver (OSM) in the I2O core
- *	@drv: I2O driver which should be registered
- *
- *	Registers the OSM drv in the I2O core and creates an event queues if
- *	necessary.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_driver_register(struct i2o_driver *drv)
-{
-	struct i2o_controller *c;
-	int i;
-	int rc = 0;
-	unsigned long flags;
-
-	osm_debug("Register driver %s\n", drv->name);
-
-	if (drv->event) {
-		drv->event_queue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 1,
-						   drv->name);
-		if (!drv->event_queue) {
-			osm_err("Could not initialize event queue for driver "
-				"%s\n", drv->name);
-			return -EFAULT;
-		}
-		osm_debug("Event queue initialized for driver %s\n", drv->name);
-	} else
-		drv->event_queue = NULL;
-
-	drv->driver.name = drv->name;
-	drv->driver.bus = &i2o_bus_type;
-
-	spin_lock_irqsave(&i2o_drivers_lock, flags);
-
-	for (i = 0; i2o_drivers[i]; i++)
-		if (i >= i2o_max_drivers) {
-			osm_err("too many drivers registered, increase "
-				"max_drivers\n");
-			spin_unlock_irqrestore(&i2o_drivers_lock, flags);
-			rc = -EFAULT;
-			goto out;
-		}
-
-	drv->context = i;
-	i2o_drivers[i] = drv;
-
-	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
-
-	osm_debug("driver %s gets context id %d\n", drv->name, drv->context);
-
-	list_for_each_entry(c, &i2o_controllers, list) {
-		struct i2o_device *i2o_dev;
-
-		i2o_driver_notify_controller_add(drv, c);
-		list_for_each_entry(i2o_dev, &c->devices, list)
-		    i2o_driver_notify_device_add(drv, i2o_dev);
-	}
-
-	rc = driver_register(&drv->driver);
-	if (rc)
-		goto out;
-
-	return 0;
-out:
-	if (drv->event_queue) {
-		destroy_workqueue(drv->event_queue);
-		drv->event_queue = NULL;
-	}
-
-	return rc;
-};
-
-/**
- *	i2o_driver_unregister - Unregister a I2O driver (OSM) from the I2O core
- *	@drv: I2O driver which should be unregistered
- *
- *	Unregisters the OSM drv from the I2O core and cleanup event queues if
- *	necessary.
- */
-void i2o_driver_unregister(struct i2o_driver *drv)
-{
-	struct i2o_controller *c;
-	unsigned long flags;
-
-	osm_debug("unregister driver %s\n", drv->name);
-
-	driver_unregister(&drv->driver);
-
-	list_for_each_entry(c, &i2o_controllers, list) {
-		struct i2o_device *i2o_dev;
-
-		list_for_each_entry(i2o_dev, &c->devices, list)
-		    i2o_driver_notify_device_remove(drv, i2o_dev);
-
-		i2o_driver_notify_controller_remove(drv, c);
-	}
-
-	spin_lock_irqsave(&i2o_drivers_lock, flags);
-	i2o_drivers[drv->context] = NULL;
-	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
-
-	if (drv->event_queue) {
-		destroy_workqueue(drv->event_queue);
-		drv->event_queue = NULL;
-		osm_debug("event queue removed for %s\n", drv->name);
-	}
-};
-
-/**
- *	i2o_driver_dispatch - dispatch an I2O reply message
- *	@c: I2O controller of the message
- *	@m: I2O message number
- *
- *	The reply is delivered to the driver from which the original message
- *	was. This function is only called from interrupt context.
- *
- *	Returns 0 on success and the message should not be flushed. Returns > 0
- *	on success and if the message should be flushed afterwords. Returns
- *	negative error code on failure (the message will be flushed too).
- */
-int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
-{
-	struct i2o_driver *drv;
-	struct i2o_message *msg = i2o_msg_out_to_virt(c, m);
-	u32 context = le32_to_cpu(msg->u.s.icntxt);
-	unsigned long flags;
-
-	if (unlikely(context >= i2o_max_drivers)) {
-		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
-			 context);
-		return -EIO;
-	}
-
-	spin_lock_irqsave(&i2o_drivers_lock, flags);
-	drv = i2o_drivers[context];
-	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
-
-	if (unlikely(!drv)) {
-		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
-			 context);
-		return -EIO;
-	}
-
-	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) {
-		struct i2o_device *dev, *tmp;
-		struct i2o_event *evt;
-		u16 size;
-		u16 tid = le32_to_cpu(msg->u.head[1]) & 0xfff;
-
-		osm_debug("event received from device %d\n", tid);
-
-		if (!drv->event)
-			return -EIO;
-
-		/* cut of header from message size (in 32-bit words) */
-		size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
-
-		evt = kzalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
-		if (!evt)
-			return -ENOMEM;
-
-		evt->size = size;
-		evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
-		evt->event_indicator = le32_to_cpu(msg->body[0]);
-		memcpy(&evt->data, &msg->body[1], size * 4);
-
-		list_for_each_entry_safe(dev, tmp, &c->devices, list)
-		    if (dev->lct_data.tid == tid) {
-			evt->i2o_dev = dev;
-			break;
-		}
-
-		INIT_WORK(&evt->work, drv->event);
-		queue_work(drv->event_queue, &evt->work);
-		return 1;
-	}
-
-	if (unlikely(!drv->reply)) {
-		osm_debug("%s: Reply to driver %s, but no reply function"
-			  " defined!\n", c->name, drv->name);
-		return -EIO;
-	}
-
-	return drv->reply(c, m, msg);
-}
-
-/**
- *	i2o_driver_notify_controller_add_all - Send notify of added controller
- *	@c: newly added controller
- *
- *	Send notifications to all registered drivers that a new controller was
- *	added.
- */
-void i2o_driver_notify_controller_add_all(struct i2o_controller *c)
-{
-	int i;
-	struct i2o_driver *drv;
-
-	for (i = 0; i < i2o_max_drivers; i++) {
-		drv = i2o_drivers[i];
-
-		if (drv)
-			i2o_driver_notify_controller_add(drv, c);
-	}
-}
-
-/**
- *	i2o_driver_notify_controller_remove_all - Send notify of removed controller
- *	@c: controller that is being removed
- *
- *	Send notifications to all registered drivers that a controller was
- *	removed.
- */
-void i2o_driver_notify_controller_remove_all(struct i2o_controller *c)
-{
-	int i;
-	struct i2o_driver *drv;
-
-	for (i = 0; i < i2o_max_drivers; i++) {
-		drv = i2o_drivers[i];
-
-		if (drv)
-			i2o_driver_notify_controller_remove(drv, c);
-	}
-}
-
-/**
- *	i2o_driver_notify_device_add_all - Send notify of added device
- *	@i2o_dev: newly added I2O device
- *
- *	Send notifications to all registered drivers that a device was added.
- */
-void i2o_driver_notify_device_add_all(struct i2o_device *i2o_dev)
-{
-	int i;
-	struct i2o_driver *drv;
-
-	for (i = 0; i < i2o_max_drivers; i++) {
-		drv = i2o_drivers[i];
-
-		if (drv)
-			i2o_driver_notify_device_add(drv, i2o_dev);
-	}
-}
-
-/**
- *	i2o_driver_notify_device_remove_all - Send notify of removed device
- *	@i2o_dev: device that is being removed
- *
- *	Send notifications to all registered drivers that a device was removed.
- */
-void i2o_driver_notify_device_remove_all(struct i2o_device *i2o_dev)
-{
-	int i;
-	struct i2o_driver *drv;
-
-	for (i = 0; i < i2o_max_drivers; i++) {
-		drv = i2o_drivers[i];
-
-		if (drv)
-			i2o_driver_notify_device_remove(drv, i2o_dev);
-	}
-}
-
-/**
- *	i2o_driver_init - initialize I2O drivers (OSMs)
- *
- *	Registers the I2O bus and allocate memory for the array of OSMs.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int __init i2o_driver_init(void)
-{
-	int rc = 0;
-
-	spin_lock_init(&i2o_drivers_lock);
-
-	if ((i2o_max_drivers < 2) || (i2o_max_drivers > 64)) {
-		osm_warn("max_drivers set to %d, but must be >=2 and <= 64\n",
-			 i2o_max_drivers);
-		i2o_max_drivers = I2O_MAX_DRIVERS;
-	}
-	osm_info("max drivers = %d\n", i2o_max_drivers);
-
-	i2o_drivers =
-	    kcalloc(i2o_max_drivers, sizeof(*i2o_drivers), GFP_KERNEL);
-	if (!i2o_drivers)
-		return -ENOMEM;
-
-	rc = bus_register(&i2o_bus_type);
-
-	if (rc < 0)
-		kfree(i2o_drivers);
-
-	return rc;
-};
-
-/**
- *	i2o_driver_exit - clean up I2O drivers (OSMs)
- *
- *	Unregisters the I2O bus and frees driver array.
- */
-void i2o_driver_exit(void)
-{
-	bus_unregister(&i2o_bus_type);
-	kfree(i2o_drivers);
-};
-
-EXPORT_SYMBOL(i2o_driver_register);
-EXPORT_SYMBOL(i2o_driver_unregister);
-EXPORT_SYMBOL(i2o_driver_notify_controller_add_all);
-EXPORT_SYMBOL(i2o_driver_notify_controller_remove_all);
-EXPORT_SYMBOL(i2o_driver_notify_device_add_all);
-EXPORT_SYMBOL(i2o_driver_notify_device_remove_all);
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
deleted file mode 100644
index a3970e56ae53..000000000000
--- a/drivers/message/i2o/exec-osm.c
+++ /dev/null
@@ -1,612 +0,0 @@
-/*
- *	Executive OSM
- *
- * 	Copyright (C) 1999-2002	Red Hat Software
- *
- *	Written by Alan Cox, Building Number Three Ltd
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	A lot of the I2O message side code from this is taken from the Red
- *	Creek RCPCI45 adapter driver by Red Creek Communications
- *
- *	Fixes/additions:
- *		Philipp Rumpf
- *		Juha Sievänen <Juha.Sievanen@cs.Helsinki.FI>
- *		Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
- *		Deepak Saxena <deepak@plexity.net>
- *		Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
- *		Alan Cox <alan@lxorguk.ukuu.org.uk>:
- *			Ported to Linux 2.5.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Minor fixes for 2.6.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Support for sysfs included.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-#include <linux/delay.h>
-#include <linux/workqueue.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/sched.h>	/* wait_event_interruptible_timeout() needs this */
-#include <asm/param.h>		/* HZ */
-#include "core.h"
-
-#define OSM_NAME "exec-osm"
-
-struct i2o_driver i2o_exec_driver;
-
-/* global wait list for POST WAIT */
-static LIST_HEAD(i2o_exec_wait_list);
-
-/* Wait struct needed for POST WAIT */
-struct i2o_exec_wait {
-	wait_queue_head_t *wq;	/* Pointer to Wait queue */
-	struct i2o_dma dma;	/* DMA buffers to free on failure */
-	u32 tcntxt;		/* transaction context from reply */
-	int complete;		/* 1 if reply received otherwise 0 */
-	u32 m;			/* message id */
-	struct i2o_message *msg;	/* pointer to the reply message */
-	struct list_head list;	/* node in global wait list */
-	spinlock_t lock;	/* lock before modifying */
-};
-
-/* Work struct needed to handle LCT NOTIFY replies */
-struct i2o_exec_lct_notify_work {
-	struct work_struct work;	/* work struct */
-	struct i2o_controller *c;	/* controller on which the LCT NOTIFY
-					   was received */
-};
-
-/* Exec OSM class handling definition */
-static struct i2o_class_id i2o_exec_class_id[] = {
-	{I2O_CLASS_EXECUTIVE},
-	{I2O_CLASS_END}
-};
-
-/**
- *	i2o_exec_wait_alloc - Allocate a i2o_exec_wait struct an initialize it
- *
- *	Allocate the i2o_exec_wait struct and initialize the wait.
- *
- *	Returns i2o_exec_wait pointer on success or negative error code on
- *	failure.
- */
-static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
-{
-	struct i2o_exec_wait *wait;
-
-	wait = kzalloc(sizeof(*wait), GFP_KERNEL);
-	if (!wait)
-		return NULL;
-
-	INIT_LIST_HEAD(&wait->list);
-	spin_lock_init(&wait->lock);
-
-	return wait;
-};
-
-/**
- *	i2o_exec_wait_free - Free an i2o_exec_wait struct
- *	@wait: I2O wait data which should be cleaned up
- */
-static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
-{
-	kfree(wait);
-};
-
-/**
- * 	i2o_msg_post_wait_mem - Post and wait a message with DMA buffers
- *	@c: controller
- *	@msg: message to post
- *	@timeout: time in seconds to wait
- *	@dma: i2o_dma struct of the DMA buffer to free on failure
- *
- * 	This API allows an OSM to post a message and then be told whether or
- *	not the system received a successful reply. If the message times out
- *	then the value '-ETIMEDOUT' is returned. This is a special case. In
- *	this situation the message may (should) complete at an indefinite time
- *	in the future. When it completes it will use the memory buffer
- *	attached to the request. If -ETIMEDOUT is returned then the memory
- *	buffer must not be freed. Instead the event completion will free them
- *	for you. In all other cases the buffer are your problem.
- *
- *	Returns 0 on success, negative error code on timeout or positive error
- *	code from reply.
- */
-int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
-			  unsigned long timeout, struct i2o_dma *dma)
-{
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-	struct i2o_exec_wait *wait;
-	static u32 tcntxt = 0x80000000;
-	unsigned long flags;
-	int rc = 0;
-
-	wait = i2o_exec_wait_alloc();
-	if (!wait) {
-		i2o_msg_nop(c, msg);
-		return -ENOMEM;
-	}
-
-	if (tcntxt == 0xffffffff)
-		tcntxt = 0x80000000;
-
-	if (dma)
-		wait->dma = *dma;
-
-	/*
-	 * Fill in the message initiator context and transaction context.
-	 * We will only use transaction contexts >= 0x80000000 for POST WAIT,
-	 * so we could find a POST WAIT reply easier in the reply handler.
-	 */
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	wait->tcntxt = tcntxt++;
-	msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
-
-	wait->wq = &wq;
-	/*
-	 * we add elements to the head, because if a entry in the list will
-	 * never be removed, we have to iterate over it every time
-	 */
-	list_add(&wait->list, &i2o_exec_wait_list);
-
-	/*
-	 * Post the message to the controller. At some point later it will
-	 * return. If we time out before it returns then complete will be zero.
-	 */
-	i2o_msg_post(c, msg);
-
-	wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ);
-
-	spin_lock_irqsave(&wait->lock, flags);
-
-	wait->wq = NULL;
-
-	if (wait->complete)
-		rc = le32_to_cpu(wait->msg->body[0]) >> 24;
-	else {
-		/*
-		 * We cannot remove it now. This is important. When it does
-		 * terminate (which it must do if the controller has not
-		 * died...) then it will otherwise scribble on stuff.
-		 *
-		 * FIXME: try abort message
-		 */
-		if (dma)
-			dma->virt = NULL;
-
-		rc = -ETIMEDOUT;
-	}
-
-	spin_unlock_irqrestore(&wait->lock, flags);
-
-	if (rc != -ETIMEDOUT) {
-		i2o_flush_reply(c, wait->m);
-		i2o_exec_wait_free(wait);
-	}
-
-	return rc;
-};
-
-/**
- *	i2o_msg_post_wait_complete - Reply to a i2o_msg_post request from IOP
- *	@c: I2O controller which answers
- *	@m: message id
- *	@msg: pointer to the I2O reply message
- *	@context: transaction context of request
- *
- *	This function is called in interrupt context only. If the reply reached
- *	before the timeout, the i2o_exec_wait struct is filled with the message
- *	and the task will be waked up. The task is now responsible for returning
- *	the message m back to the controller! If the message reaches us after
- *	the timeout clean up the i2o_exec_wait struct (including allocated
- *	DMA buffer).
- *
- *	Return 0 on success and if the message m should not be given back to the
- *	I2O controller, or >0 on success and if the message should be given back
- *	afterwords. Returns negative error code on failure. In this case the
- *	message must also be given back to the controller.
- */
-static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
-				      struct i2o_message *msg, u32 context)
-{
-	struct i2o_exec_wait *wait, *tmp;
-	unsigned long flags;
-	int rc = 1;
-
-	/*
-	 * We need to search through the i2o_exec_wait_list to see if the given
-	 * message is still outstanding. If not, it means that the IOP took
-	 * longer to respond to the message than we had allowed and timer has
-	 * already expired. Not much we can do about that except log it for
-	 * debug purposes, increase timeout, and recompile.
-	 */
-	list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) {
-		if (wait->tcntxt == context) {
-			spin_lock_irqsave(&wait->lock, flags);
-
-			list_del(&wait->list);
-
-			wait->m = m;
-			wait->msg = msg;
-			wait->complete = 1;
-
-			if (wait->wq)
-				rc = 0;
-			else
-				rc = -1;
-
-			spin_unlock_irqrestore(&wait->lock, flags);
-
-			if (rc) {
-				struct device *dev;
-
-				dev = &c->pdev->dev;
-
-				pr_debug("%s: timedout reply received!\n",
-					 c->name);
-				i2o_dma_free(dev, &wait->dma);
-				i2o_exec_wait_free(wait);
-			} else
-				wake_up_interruptible(wait->wq);
-
-			return rc;
-		}
-	}
-
-	osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
-		 context);
-
-	return -1;
-};
-
-/**
- *	i2o_exec_show_vendor_id - Displays Vendor ID of controller
- *	@d: device of which the Vendor ID should be displayed
- *	@attr: device_attribute to display
- *	@buf: buffer into which the Vendor ID should be printed
- *
- *	Returns number of bytes printed into buffer.
- */
-static ssize_t i2o_exec_show_vendor_id(struct device *d,
-				       struct device_attribute *attr, char *buf)
-{
-	struct i2o_device *dev = to_i2o_device(d);
-	u16 id;
-
-	if (!i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) {
-		sprintf(buf, "0x%04x", le16_to_cpu(id));
-		return strlen(buf) + 1;
-	}
-
-	return 0;
-};
-
-/**
- *	i2o_exec_show_product_id - Displays Product ID of controller
- *	@d: device of which the Product ID should be displayed
- *	@attr: device_attribute to display
- *	@buf: buffer into which the Product ID should be printed
- *
- *	Returns number of bytes printed into buffer.
- */
-static ssize_t i2o_exec_show_product_id(struct device *d,
-					struct device_attribute *attr,
-					char *buf)
-{
-	struct i2o_device *dev = to_i2o_device(d);
-	u16 id;
-
-	if (!i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) {
-		sprintf(buf, "0x%04x", le16_to_cpu(id));
-		return strlen(buf) + 1;
-	}
-
-	return 0;
-};
-
-/* Exec-OSM device attributes */
-static DEVICE_ATTR(vendor_id, S_IRUGO, i2o_exec_show_vendor_id, NULL);
-static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL);
-
-/**
- *	i2o_exec_probe - Called if a new I2O device (executive class) appears
- *	@dev: I2O device which should be probed
- *
- *	Registers event notification for every event from Executive device. The
- *	return is always 0, because we want all devices of class Executive.
- *
- *	Returns 0 on success.
- */
-static int i2o_exec_probe(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	int rc;
-
-	rc = i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff);
-	if (rc) goto err_out;
-
-	rc = device_create_file(dev, &dev_attr_vendor_id);
-	if (rc) goto err_evtreg;
-	rc = device_create_file(dev, &dev_attr_product_id);
-	if (rc) goto err_vid;
-
-	i2o_dev->iop->exec = i2o_dev;
-
-	return 0;
-
-err_vid:
-	device_remove_file(dev, &dev_attr_vendor_id);
-err_evtreg:
-	i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0);
-err_out:
-	return rc;
-};
-
-/**
- *	i2o_exec_remove - Called on I2O device removal
- *	@dev: I2O device which was removed
- *
- *	Unregisters event notification from Executive I2O device.
- *
- *	Returns 0 on success.
- */
-static int i2o_exec_remove(struct device *dev)
-{
-	device_remove_file(dev, &dev_attr_product_id);
-	device_remove_file(dev, &dev_attr_vendor_id);
-
-	i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0);
-
-	return 0;
-};
-
-#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
-/**
- *	i2o_exec_lct_notify - Send a asynchronus LCT NOTIFY request
- *	@c: I2O controller to which the request should be send
- *	@change_ind: change indicator
- *
- *	This function sends a LCT NOTIFY request to the I2O controller with
- *	the change indicator change_ind. If the change_ind == 0 the controller
- *	replies immediately after the request. If change_ind > 0 the reply is
- *	send after change indicator of the LCT is > change_ind.
- */
-static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	struct device *dev;
-	struct i2o_message *msg;
-
-	mutex_lock(&c->lct_lock);
-
-	dev = &c->pdev->dev;
-
-	if (i2o_dma_realloc(dev, &c->dlct,
-					le32_to_cpu(sb->expected_lct_size))) {
-		mutex_unlock(&c->lct_lock);
-		return -ENOMEM;
-	}
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg)) {
-		mutex_unlock(&c->lct_lock);
-		return PTR_ERR(msg);
-	}
-
-	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
-	msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
-				     ADAPTER_TID);
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
-	msg->body[0] = cpu_to_le32(0xffffffff);
-	msg->body[1] = cpu_to_le32(change_ind);
-	msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
-	msg->body[3] = cpu_to_le32(c->dlct.phys);
-
-	i2o_msg_post(c, msg);
-
-	mutex_unlock(&c->lct_lock);
-
-	return 0;
-}
-#endif
-
-/**
- *	i2o_exec_lct_modified - Called on LCT NOTIFY reply
- *	@_work: work struct for a specific controller
- *
- *	This function handles asynchronus LCT NOTIFY replies. It parses the
- *	new LCT and if the buffer for the LCT was to small sends a LCT NOTIFY
- *	again, otherwise send LCT NOTIFY to get informed on next LCT change.
- */
-static void i2o_exec_lct_modified(struct work_struct *_work)
-{
-	struct i2o_exec_lct_notify_work *work =
-		container_of(_work, struct i2o_exec_lct_notify_work, work);
-	u32 change_ind = 0;
-	struct i2o_controller *c = work->c;
-
-	kfree(work);
-
-	if (i2o_device_parse_lct(c) != -EAGAIN)
-		change_ind = c->lct->change_ind + 1;
-
-#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
-	i2o_exec_lct_notify(c, change_ind);
-#endif
-};
-
-/**
- *	i2o_exec_reply -  I2O Executive reply handler
- *	@c: I2O controller from which the reply comes
- *	@m: message id
- *	@msg: pointer to the I2O reply message
- *
- *	This function is always called from interrupt context. If a POST WAIT
- *	reply was received, pass it to the complete function. If a LCT NOTIFY
- *	reply was received, a new event is created to handle the update.
- *
- *	Returns 0 on success and if the reply should not be flushed or > 0
- *	on success and if the reply should be flushed. Returns negative error
- *	code on failure and if the reply should be flushed.
- */
-static int i2o_exec_reply(struct i2o_controller *c, u32 m,
-			  struct i2o_message *msg)
-{
-	u32 context;
-
-	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
-		struct i2o_message __iomem *pmsg;
-		u32 pm;
-
-		/*
-		 * If Fail bit is set we must take the transaction context of
-		 * the preserved message to find the right request again.
-		 */
-
-		pm = le32_to_cpu(msg->body[3]);
-		pmsg = i2o_msg_in_to_virt(c, pm);
-		context = readl(&pmsg->u.s.tcntxt);
-
-		i2o_report_status(KERN_INFO, "i2o_core", msg);
-
-		/* Release the preserved msg */
-		i2o_msg_nop_mfa(c, pm);
-	} else
-		context = le32_to_cpu(msg->u.s.tcntxt);
-
-	if (context & 0x80000000)
-		return i2o_msg_post_wait_complete(c, m, msg, context);
-
-	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) {
-		struct i2o_exec_lct_notify_work *work;
-
-		pr_debug("%s: LCT notify received\n", c->name);
-
-		work = kmalloc(sizeof(*work), GFP_ATOMIC);
-		if (!work)
-			return -ENOMEM;
-
-		work->c = c;
-
-		INIT_WORK(&work->work, i2o_exec_lct_modified);
-		queue_work(i2o_exec_driver.event_queue, &work->work);
-		return 1;
-	}
-
-	/*
-	 * If this happens, we want to dump the message to the syslog so
-	 * it can be sent back to the card manufacturer by the end user
-	 * to aid in debugging.
-	 *
-	 */
-	printk(KERN_WARNING "%s: Unsolicited message reply sent to core!"
-	       "Message dumped to syslog\n", c->name);
-	i2o_dump_message(msg);
-
-	return -EFAULT;
-}
-
-/**
- *	i2o_exec_event - Event handling function
- *	@work: Work item in occurring event
- *
- *	Handles events send by the Executive device. At the moment does not do
- *	anything useful.
- */
-static void i2o_exec_event(struct work_struct *work)
-{
-	struct i2o_event *evt = container_of(work, struct i2o_event, work);
-
-	if (likely(evt->i2o_dev))
-		osm_debug("Event received from device: %d\n",
-			  evt->i2o_dev->lct_data.tid);
-	kfree(evt);
-};
-
-/**
- *	i2o_exec_lct_get - Get the IOP's Logical Configuration Table
- *	@c: I2O controller from which the LCT should be fetched
- *
- *	Send a LCT NOTIFY request to the controller, and wait
- *	I2O_TIMEOUT_LCT_GET seconds until arrival of response. If the LCT is
- *	to large, retry it.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_exec_lct_get(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	int i = 0;
-	int rc = -EAGAIN;
-
-	for (i = 1; i <= I2O_LCT_GET_TRIES; i++) {
-		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-
-		msg->u.head[0] =
-		    cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
-		msg->u.head[1] =
-		    cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
-				ADAPTER_TID);
-		msg->body[0] = cpu_to_le32(0xffffffff);
-		msg->body[1] = cpu_to_le32(0x00000000);
-		msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
-		msg->body[3] = cpu_to_le32(c->dlct.phys);
-
-		rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET);
-		if (rc < 0)
-			break;
-
-		rc = i2o_device_parse_lct(c);
-		if (rc != -EAGAIN)
-			break;
-	}
-
-	return rc;
-}
-
-/* Exec OSM driver struct */
-struct i2o_driver i2o_exec_driver = {
-	.name = OSM_NAME,
-	.reply = i2o_exec_reply,
-	.event = i2o_exec_event,
-	.classes = i2o_exec_class_id,
-	.driver = {
-		   .probe = i2o_exec_probe,
-		   .remove = i2o_exec_remove,
-		   },
-};
-
-/**
- *	i2o_exec_init - Registers the Exec OSM
- *
- *	Registers the Exec OSM in the I2O core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int __init i2o_exec_init(void)
-{
-	return i2o_driver_register(&i2o_exec_driver);
-};
-
-/**
- *	i2o_exec_exit - Removes the Exec OSM
- *
- *	Unregisters the Exec OSM from the I2O core.
- */
-void i2o_exec_exit(void)
-{
-	i2o_driver_unregister(&i2o_exec_driver);
-};
-
-EXPORT_SYMBOL(i2o_msg_post_wait_mem);
-EXPORT_SYMBOL(i2o_exec_lct_get);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
deleted file mode 100644
index 6fc3866965df..000000000000
--- a/drivers/message/i2o/i2o_block.c
+++ /dev/null
@@ -1,1228 +0,0 @@
-/*
- *	Block OSM
- *
- * 	Copyright (C) 1999-2002	Red Hat Software
- *
- *	Written by Alan Cox, Building Number Three Ltd
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	This program is distributed in the hope that it will be useful, but
- *	WITHOUT ANY WARRANTY; without even the implied warranty of
- *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *	General Public License for more details.
- *
- *	For the purpose of avoiding doubt the preferred form of the work
- *	for making modifications shall be a standards compliant form such
- *	gzipped tar and not one requiring a proprietary or patent encumbered
- *	tool to unpack.
- *
- *	Fixes/additions:
- *		Steve Ralston:
- *			Multiple device handling error fixes,
- *			Added a queue depth.
- *		Alan Cox:
- *			FC920 has an rmw bug. Dont or in the end marker.
- *			Removed queue walk, fixed for 64bitness.
- *			Rewrote much of the code over time
- *			Added indirect block lists
- *			Handle 64K limits on many controllers
- *			Don't use indirects on the Promise (breaks)
- *			Heavily chop down the queue depths
- *		Deepak Saxena:
- *			Independent queues per IOP
- *			Support for dynamic device creation/deletion
- *			Code cleanup
- *	    		Support for larger I/Os through merge* functions
- *			(taken from DAC960 driver)
- *		Boji T Kannanthanam:
- *			Set the I2O Block devices to be detected in increasing
- *			order of TIDs during boot.
- *			Search and set the I2O block device that we boot off
- *			from as the first device to be claimed (as /dev/i2o/hda)
- *			Properly attach/detach I2O gendisk structure from the
- *			system gendisk list. The I2O block devices now appear in
- *			/proc/partitions.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Minor bugfixes for 2.6.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/i2o.h>
-#include <linux/mutex.h>
-
-#include <linux/mempool.h>
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hdreg.h>
-
-#include <scsi/scsi.h>
-
-#include "i2o_block.h"
-
-#define OSM_NAME	"block-osm"
-#define OSM_VERSION	"1.325"
-#define OSM_DESCRIPTION	"I2O Block Device OSM"
-
-static DEFINE_MUTEX(i2o_block_mutex);
-static struct i2o_driver i2o_block_driver;
-
-/* global Block OSM request mempool */
-static struct i2o_block_mempool i2o_blk_req_pool;
-
-/* Block OSM class handling definition */
-static struct i2o_class_id i2o_block_class_id[] = {
-	{I2O_CLASS_RANDOM_BLOCK_STORAGE},
-	{I2O_CLASS_END}
-};
-
-/**
- *	i2o_block_device_free - free the memory of the I2O Block device
- *	@dev: I2O Block device, which should be cleaned up
- *
- *	Frees the request queue, gendisk and the i2o_block_device structure.
- */
-static void i2o_block_device_free(struct i2o_block_device *dev)
-{
-	blk_cleanup_queue(dev->gd->queue);
-
-	put_disk(dev->gd);
-
-	kfree(dev);
-};
-
-/**
- *	i2o_block_remove - remove the I2O Block device from the system again
- *	@dev: I2O Block device which should be removed
- *
- *	Remove gendisk from system and free all allocated memory.
- *
- *	Always returns 0.
- */
-static int i2o_block_remove(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_block_device *i2o_blk_dev = dev_get_drvdata(dev);
-
-	osm_info("device removed (TID: %03x): %s\n", i2o_dev->lct_data.tid,
-		 i2o_blk_dev->gd->disk_name);
-
-	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0);
-
-	del_gendisk(i2o_blk_dev->gd);
-
-	dev_set_drvdata(dev, NULL);
-
-	i2o_device_claim_release(i2o_dev);
-
-	i2o_block_device_free(i2o_blk_dev);
-
-	return 0;
-};
-
-/**
- *	i2o_block_device flush - Flush all dirty data of I2O device dev
- *	@dev: I2O device which should be flushed
- *
- *	Flushes all dirty data on device dev.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_device_flush(struct i2o_device *dev)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->
-			lct_data.tid);
-	msg->body[0] = cpu_to_le32(60 << 16);
-	osm_debug("Flushing...\n");
-
-	return i2o_msg_post_wait(dev->iop, msg, 60);
-};
-
-/**
- *	i2o_block_device_mount - Mount (load) the media of device dev
- *	@dev: I2O device which should receive the mount request
- *	@media_id: Media Identifier
- *
- *	Load a media into drive. Identifier should be set to -1, because the
- *	spec does not support any other value.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->
-			lct_data.tid);
-	msg->body[0] = cpu_to_le32(-1);
-	msg->body[1] = cpu_to_le32(0x00000000);
-	osm_debug("Mounting...\n");
-
-	return i2o_msg_post_wait(dev->iop, msg, 2);
-};
-
-/**
- *	i2o_block_device_lock - Locks the media of device dev
- *	@dev: I2O device which should receive the lock request
- *	@media_id: Media Identifier
- *
- *	Lock media of device dev to prevent removal. The media identifier
- *	should be set to -1, because the spec does not support any other value.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->
-			lct_data.tid);
-	msg->body[0] = cpu_to_le32(-1);
-	osm_debug("Locking...\n");
-
-	return i2o_msg_post_wait(dev->iop, msg, 2);
-};
-
-/**
- *	i2o_block_device_unlock - Unlocks the media of device dev
- *	@dev: I2O device which should receive the unlocked request
- *	@media_id: Media Identifier
- *
- *	Unlocks the media in device dev. The media identifier should be set to
- *	-1, because the spec does not support any other value.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->
-			lct_data.tid);
-	msg->body[0] = cpu_to_le32(media_id);
-	osm_debug("Unlocking...\n");
-
-	return i2o_msg_post_wait(dev->iop, msg, 2);
-};
-
-/**
- *	i2o_block_device_power - Power management for device dev
- *	@dev: I2O device which should receive the power management request
- *	@op: Operation to send
- *
- *	Send a power management request to the device dev.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_device_power(struct i2o_block_device *dev, u8 op)
-{
-	struct i2o_device *i2o_dev = dev->i2o_dev;
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_message *msg;
-	int rc;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->
-			lct_data.tid);
-	msg->body[0] = cpu_to_le32(op << 24);
-	osm_debug("Power...\n");
-
-	rc = i2o_msg_post_wait(c, msg, 60);
-	if (!rc)
-		dev->power = op;
-
-	return rc;
-};
-
-/**
- *	i2o_block_request_alloc - Allocate an I2O block request struct
- *
- *	Allocates an I2O block request struct and initialize the list.
- *
- *	Returns a i2o_block_request pointer on success or negative error code
- *	on failure.
- */
-static inline struct i2o_block_request *i2o_block_request_alloc(void)
-{
-	struct i2o_block_request *ireq;
-
-	ireq = mempool_alloc(i2o_blk_req_pool.pool, GFP_ATOMIC);
-	if (!ireq)
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&ireq->queue);
-	sg_init_table(ireq->sg_table, I2O_MAX_PHYS_SEGMENTS);
-
-	return ireq;
-};
-
-/**
- *	i2o_block_request_free - Frees a I2O block request
- *	@ireq: I2O block request which should be freed
- *
- *	Frees the allocated memory (give it back to the request mempool).
- */
-static inline void i2o_block_request_free(struct i2o_block_request *ireq)
-{
-	mempool_free(ireq, i2o_blk_req_pool.pool);
-};
-
-/**
- *	i2o_block_sglist_alloc - Allocate the SG list and map it
- *	@c: I2O controller to which the request belongs
- *	@ireq: I2O block request
- *	@mptr: message body pointer
- *
- *	Builds the SG list and map it to be accessible by the controller.
- *
- *	Returns 0 on failure or 1 on success.
- */
-static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
-					 struct i2o_block_request *ireq,
-					 u32 ** mptr)
-{
-	int nents;
-	enum dma_data_direction direction;
-
-	ireq->dev = &c->pdev->dev;
-	nents = blk_rq_map_sg(ireq->req->q, ireq->req, ireq->sg_table);
-
-	if (rq_data_dir(ireq->req) == READ)
-		direction = PCI_DMA_FROMDEVICE;
-	else
-		direction = PCI_DMA_TODEVICE;
-
-	ireq->sg_nents = nents;
-
-	return i2o_dma_map_sg(c, ireq->sg_table, nents, direction, mptr);
-};
-
-/**
- *	i2o_block_sglist_free - Frees the SG list
- *	@ireq: I2O block request from which the SG should be freed
- *
- *	Frees the SG list from the I2O block request.
- */
-static inline void i2o_block_sglist_free(struct i2o_block_request *ireq)
-{
-	enum dma_data_direction direction;
-
-	if (rq_data_dir(ireq->req) == READ)
-		direction = PCI_DMA_FROMDEVICE;
-	else
-		direction = PCI_DMA_TODEVICE;
-
-	dma_unmap_sg(ireq->dev, ireq->sg_table, ireq->sg_nents, direction);
-};
-
-/**
- *	i2o_block_prep_req_fn - Allocates I2O block device specific struct
- *	@q: request queue for the request
- *	@req: the request to prepare
- *
- *	Allocate the necessary i2o_block_request struct and connect it to
- *	the request. This is needed that we not lose the SG list later on.
- *
- *	Returns BLKPREP_OK on success or BLKPREP_DEFER on failure.
- */
-static int i2o_block_prep_req_fn(struct request_queue *q, struct request *req)
-{
-	struct i2o_block_device *i2o_blk_dev = q->queuedata;
-	struct i2o_block_request *ireq;
-
-	if (unlikely(!i2o_blk_dev)) {
-		osm_err("block device already removed\n");
-		return BLKPREP_KILL;
-	}
-
-	/* connect the i2o_block_request to the request */
-	if (!req->special) {
-		ireq = i2o_block_request_alloc();
-		if (IS_ERR(ireq)) {
-			osm_debug("unable to allocate i2o_block_request!\n");
-			return BLKPREP_DEFER;
-		}
-
-		ireq->i2o_blk_dev = i2o_blk_dev;
-		req->special = ireq;
-		ireq->req = req;
-	}
-	/* do not come back here */
-	req->cmd_flags |= REQ_DONTPREP;
-
-	return BLKPREP_OK;
-};
-
-/**
- *	i2o_block_delayed_request_fn - delayed request queue function
- *	@work: the delayed request with the queue to start
- *
- *	If the request queue is stopped for a disk, and there is no open
- *	request, a new event is created, which calls this function to start
- *	the queue after I2O_BLOCK_REQUEST_TIME. Otherwise the queue will never
- *	be started again.
- */
-static void i2o_block_delayed_request_fn(struct work_struct *work)
-{
-	struct i2o_block_delayed_request *dreq =
-		container_of(work, struct i2o_block_delayed_request,
-			     work.work);
-	struct request_queue *q = dreq->queue;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_start_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-	kfree(dreq);
-};
-
-/**
- *	i2o_block_end_request - Post-processing of completed commands
- *	@req: request which should be completed
- *	@error: 0 for success, < 0 for error
- *	@nr_bytes: number of bytes to complete
- *
- *	Mark the request as complete. The lock must not be held when entering.
- *
- */
-static void i2o_block_end_request(struct request *req, int error,
-				  int nr_bytes)
-{
-	struct i2o_block_request *ireq = req->special;
-	struct i2o_block_device *dev = ireq->i2o_blk_dev;
-	struct request_queue *q = req->q;
-	unsigned long flags;
-
-	if (blk_end_request(req, error, nr_bytes))
-		if (error)
-			blk_end_request_all(req, -EIO);
-
-	spin_lock_irqsave(q->queue_lock, flags);
-
-	if (likely(dev)) {
-		dev->open_queue_depth--;
-		list_del(&ireq->queue);
-	}
-
-	blk_start_queue(q);
-
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	i2o_block_sglist_free(ireq);
-	i2o_block_request_free(ireq);
-};
-
-/**
- *	i2o_block_reply - Block OSM reply handler.
- *	@c: I2O controller from which the message arrives
- *	@m: message id of reply
- *	@msg: the actual I2O message reply
- *
- *	This function gets all the message replies.
- *
- */
-static int i2o_block_reply(struct i2o_controller *c, u32 m,
-			   struct i2o_message *msg)
-{
-	struct request *req;
-	int error = 0;
-
-	req = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
-	if (unlikely(!req)) {
-		osm_err("NULL reply received!\n");
-		return -1;
-	}
-
-	/*
-	 *      Lets see what is cooking. We stuffed the
-	 *      request in the context.
-	 */
-
-	if ((le32_to_cpu(msg->body[0]) >> 24) != 0) {
-		u32 status = le32_to_cpu(msg->body[0]);
-		/*
-		 *      Device not ready means two things. One is that the
-		 *      the thing went offline (but not a removal media)
-		 *
-		 *      The second is that you have a SuperTrak 100 and the
-		 *      firmware got constipated. Unlike standard i2o card
-		 *      setups the supertrak returns an error rather than
-		 *      blocking for the timeout in these cases.
-		 *
-		 *      Don't stick a supertrak100 into cache aggressive modes
-		 */
-
-		osm_err("TID %03x error status: 0x%02x, detailed status: "
-			"0x%04x\n", (le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
-			status >> 24, status & 0xffff);
-
-		req->errors++;
-
-		error = -EIO;
-	}
-
-	i2o_block_end_request(req, error, le32_to_cpu(msg->body[1]));
-
-	return 1;
-};
-
-static void i2o_block_event(struct work_struct *work)
-{
-	struct i2o_event *evt = container_of(work, struct i2o_event, work);
-	osm_debug("event received\n");
-	kfree(evt);
-};
-
-/*
- *	SCSI-CAM for ioctl geometry mapping
- *	Duplicated with SCSI - this should be moved into somewhere common
- *	perhaps genhd ?
- *
- * LBA -> CHS mapping table taken from:
- *
- * "Incorporating the I2O Architecture into BIOS for Intel Architecture
- *  Platforms"
- *
- * This is an I2O document that is only available to I2O members,
- * not developers.
- *
- * From my understanding, this is how all the I2O cards do this
- *
- * Disk Size      | Sectors | Heads | Cylinders
- * ---------------+---------+-------+-------------------
- * 1 < X <= 528M  | 63      | 16    | X/(63 * 16 * 512)
- * 528M < X <= 1G | 63      | 32    | X/(63 * 32 * 512)
- * 1 < X <528M    | 63      | 16    | X/(63 * 16 * 512)
- * 1 < X <528M    | 63      | 16    | X/(63 * 16 * 512)
- *
- */
-#define	BLOCK_SIZE_528M		1081344
-#define	BLOCK_SIZE_1G		2097152
-#define	BLOCK_SIZE_21G		4403200
-#define	BLOCK_SIZE_42G		8806400
-#define	BLOCK_SIZE_84G		17612800
-
-static void i2o_block_biosparam(unsigned long capacity, unsigned short *cyls,
-				unsigned char *hds, unsigned char *secs)
-{
-	unsigned long heads, sectors, cylinders;
-
-	sectors = 63L;		/* Maximize sectors per track */
-	if (capacity <= BLOCK_SIZE_528M)
-		heads = 16;
-	else if (capacity <= BLOCK_SIZE_1G)
-		heads = 32;
-	else if (capacity <= BLOCK_SIZE_21G)
-		heads = 64;
-	else if (capacity <= BLOCK_SIZE_42G)
-		heads = 128;
-	else
-		heads = 255;
-
-	cylinders = (unsigned long)capacity / (heads * sectors);
-
-	*cyls = (unsigned short)cylinders;	/* Stuff return values */
-	*secs = (unsigned char)sectors;
-	*hds = (unsigned char)heads;
-}
-
-/**
- *	i2o_block_open - Open the block device
- *	@bdev: block device being opened
- *	@mode: file open mode
- *
- *	Power up the device, mount and lock the media. This function is called,
- *	if the block device is opened for access.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_open(struct block_device *bdev, fmode_t mode)
-{
-	struct i2o_block_device *dev = bdev->bd_disk->private_data;
-
-	if (!dev->i2o_dev)
-		return -ENODEV;
-
-	mutex_lock(&i2o_block_mutex);
-	if (dev->power > 0x1f)
-		i2o_block_device_power(dev, 0x02);
-
-	i2o_block_device_mount(dev->i2o_dev, -1);
-
-	i2o_block_device_lock(dev->i2o_dev, -1);
-
-	osm_debug("Ready.\n");
-	mutex_unlock(&i2o_block_mutex);
-
-	return 0;
-};
-
-/**
- *	i2o_block_release - Release the I2O block device
- *	@disk: gendisk device being released
- *	@mode: file open mode
- *
- *	Unlock and unmount the media, and power down the device. Gets called if
- *	the block device is closed.
- */
-static void i2o_block_release(struct gendisk *disk, fmode_t mode)
-{
-	struct i2o_block_device *dev = disk->private_data;
-	u8 operation;
-
-	/*
-	 * This is to deal with the case of an application
-	 * opening a device and then the device disappears while
-	 * it's in use, and then the application tries to release
-	 * it.  ex: Unmounting a deleted RAID volume at reboot.
-	 * If we send messages, it will just cause FAILs since
-	 * the TID no longer exists.
-	 */
-	if (!dev->i2o_dev)
-		return;
-
-	mutex_lock(&i2o_block_mutex);
-	i2o_block_device_flush(dev->i2o_dev);
-
-	i2o_block_device_unlock(dev->i2o_dev, -1);
-
-	if (dev->flags & (1 << 3 | 1 << 4))	/* Removable */
-		operation = 0x21;
-	else
-		operation = 0x24;
-
-	i2o_block_device_power(dev, operation);
-	mutex_unlock(&i2o_block_mutex);
-}
-
-static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	i2o_block_biosparam(get_capacity(bdev->bd_disk),
-			    &geo->cylinders, &geo->heads, &geo->sectors);
-	return 0;
-}
-
-/**
- *	i2o_block_ioctl - Issue device specific ioctl calls.
- *	@bdev: block device being opened
- *	@mode: file open mode
- *	@cmd: ioctl command
- *	@arg: arg
- *
- *	Handles ioctl request for the block device.
- *
- *	Return 0 on success or negative error on failure.
- */
-static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode,
-			   unsigned int cmd, unsigned long arg)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	struct i2o_block_device *dev = disk->private_data;
-	int ret = -ENOTTY;
-
-	/* Anyone capable of this syscall can do *real bad* things */
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&i2o_block_mutex);
-	switch (cmd) {
-	case BLKI2OGRSTRAT:
-		ret = put_user(dev->rcache, (int __user *)arg);
-		break;
-	case BLKI2OGWSTRAT:
-		ret = put_user(dev->wcache, (int __user *)arg);
-		break;
-	case BLKI2OSRSTRAT:
-		ret = -EINVAL;
-		if (arg < 0 || arg > CACHE_SMARTFETCH)
-			break;
-		dev->rcache = arg;
-		ret = 0;
-		break;
-	case BLKI2OSWSTRAT:
-		ret = -EINVAL;
-		if (arg != 0
-		    && (arg < CACHE_WRITETHROUGH || arg > CACHE_SMARTBACK))
-			break;
-		dev->wcache = arg;
-		ret = 0;
-		break;
-	}
-	mutex_unlock(&i2o_block_mutex);
-
-	return ret;
-};
-
-/**
- *	i2o_block_check_events - Have we seen a media change?
- *	@disk: gendisk which should be verified
- *	@clearing: events being cleared
- *
- *	Verifies if the media has changed.
- *
- *	Returns 1 if the media was changed or 0 otherwise.
- */
-static unsigned int i2o_block_check_events(struct gendisk *disk,
-					   unsigned int clearing)
-{
-	struct i2o_block_device *p = disk->private_data;
-
-	if (p->media_change_flag) {
-		p->media_change_flag = 0;
-		return DISK_EVENT_MEDIA_CHANGE;
-	}
-	return 0;
-}
-
-/**
- *	i2o_block_transfer - Transfer a request to/from the I2O controller
- *	@req: the request which should be transferred
- *
- *	This function converts the request into a I2O message. The necessary
- *	DMA buffers are allocated and after everything is setup post the message
- *	to the I2O controller. No cleanup is done by this function. It is done
- *	on the interrupt side when the reply arrives.
- *
- *	Return 0 on success or negative error code on failure.
- */
-static int i2o_block_transfer(struct request *req)
-{
-	struct i2o_block_device *dev = req->rq_disk->private_data;
-	struct i2o_controller *c;
-	u32 tid;
-	struct i2o_message *msg;
-	u32 *mptr;
-	struct i2o_block_request *ireq = req->special;
-	u32 tcntxt;
-	u32 sgl_offset = SGL_OFFSET_8;
-	u32 ctl_flags = 0x00000000;
-	int rc;
-	u32 cmd;
-
-	if (unlikely(!dev->i2o_dev)) {
-		osm_err("transfer to removed drive\n");
-		rc = -ENODEV;
-		goto exit;
-	}
-
-	tid = dev->i2o_dev->lct_data.tid;
-	c = dev->i2o_dev->iop;
-
-	msg = i2o_msg_get(c);
-	if (IS_ERR(msg)) {
-		rc = PTR_ERR(msg);
-		goto exit;
-	}
-
-	tcntxt = i2o_cntxt_list_add(c, req);
-	if (!tcntxt) {
-		rc = -ENOMEM;
-		goto nop_msg;
-	}
-
-	msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
-
-	mptr = &msg->body[0];
-
-	if (rq_data_dir(req) == READ) {
-		cmd = I2O_CMD_BLOCK_READ << 24;
-
-		switch (dev->rcache) {
-		case CACHE_PREFETCH:
-			ctl_flags = 0x201F0008;
-			break;
-
-		case CACHE_SMARTFETCH:
-			if (blk_rq_sectors(req) > 16)
-				ctl_flags = 0x201F0008;
-			else
-				ctl_flags = 0x001F0000;
-			break;
-
-		default:
-			break;
-		}
-	} else {
-		cmd = I2O_CMD_BLOCK_WRITE << 24;
-
-		switch (dev->wcache) {
-		case CACHE_WRITETHROUGH:
-			ctl_flags = 0x001F0008;
-			break;
-		case CACHE_WRITEBACK:
-			ctl_flags = 0x001F0010;
-			break;
-		case CACHE_SMARTBACK:
-			if (blk_rq_sectors(req) > 16)
-				ctl_flags = 0x001F0004;
-			else
-				ctl_flags = 0x001F0010;
-			break;
-		case CACHE_SMARTTHROUGH:
-			if (blk_rq_sectors(req) > 16)
-				ctl_flags = 0x001F0004;
-			else
-				ctl_flags = 0x001F0010;
-		default:
-			break;
-		}
-	}
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-	if (c->adaptec) {
-		u8 cmd[10];
-		u32 scsi_flags;
-		u16 hwsec;
-
-		hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT;
-		memset(cmd, 0, 10);
-
-		sgl_offset = SGL_OFFSET_12;
-
-		msg->u.head[1] =
-		    cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid);
-
-		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
-		*mptr++ = cpu_to_le32(tid);
-
-		/*
-		 * ENABLE_DISCONNECT
-		 * SIMPLE_TAG
-		 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
-		 */
-		if (rq_data_dir(req) == READ) {
-			cmd[0] = READ_10;
-			scsi_flags = 0x60a0000a;
-		} else {
-			cmd[0] = WRITE_10;
-			scsi_flags = 0xa0a0000a;
-		}
-
-		*mptr++ = cpu_to_le32(scsi_flags);
-
-		*((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec);
-		*((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec);
-
-		memcpy(mptr, cmd, 10);
-		mptr += 4;
-		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
-	} else
-#endif
-	{
-		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
-		*mptr++ = cpu_to_le32(ctl_flags);
-		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
-		*mptr++ =
-		    cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT));
-		*mptr++ =
-		    cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT));
-	}
-
-	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
-		rc = -ENOMEM;
-		goto context_remove;
-	}
-
-	msg->u.head[0] =
-	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
-
-	list_add_tail(&ireq->queue, &dev->open_queue);
-	dev->open_queue_depth++;
-
-	i2o_msg_post(c, msg);
-
-	return 0;
-
-      context_remove:
-	i2o_cntxt_list_remove(c, req);
-
-      nop_msg:
-	i2o_msg_nop(c, msg);
-
-      exit:
-	return rc;
-};
-
-/**
- *	i2o_block_request_fn - request queue handling function
- *	@q: request queue from which the request could be fetched
- *
- *	Takes the next request from the queue, transfers it and if no error
- *	occurs dequeue it from the queue. On arrival of the reply the message
- *	will be processed further. If an error occurs requeue the request.
- */
-static void i2o_block_request_fn(struct request_queue *q)
-{
-	struct request *req;
-
-	while ((req = blk_peek_request(q)) != NULL) {
-		if (req->cmd_type == REQ_TYPE_FS) {
-			struct i2o_block_delayed_request *dreq;
-			struct i2o_block_request *ireq = req->special;
-			unsigned int queue_depth;
-
-			queue_depth = ireq->i2o_blk_dev->open_queue_depth;
-
-			if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) {
-				if (!i2o_block_transfer(req)) {
-					blk_start_request(req);
-					continue;
-				} else
-					osm_info("transfer error\n");
-			}
-
-			if (queue_depth)
-				break;
-
-			/* stop the queue and retry later */
-			dreq = kmalloc(sizeof(*dreq), GFP_ATOMIC);
-			if (!dreq)
-				continue;
-
-			dreq->queue = q;
-			INIT_DELAYED_WORK(&dreq->work,
-					  i2o_block_delayed_request_fn);
-
-			if (!queue_delayed_work(i2o_block_driver.event_queue,
-						&dreq->work,
-						I2O_BLOCK_RETRY_TIME))
-				kfree(dreq);
-			else {
-				blk_stop_queue(q);
-				break;
-			}
-		} else {
-			blk_start_request(req);
-			__blk_end_request_all(req, -EIO);
-		}
-	}
-};
-
-/* I2O Block device operations definition */
-static const struct block_device_operations i2o_block_fops = {
-	.owner = THIS_MODULE,
-	.open = i2o_block_open,
-	.release = i2o_block_release,
-	.ioctl = i2o_block_ioctl,
-	.compat_ioctl = i2o_block_ioctl,
-	.getgeo = i2o_block_getgeo,
-	.check_events = i2o_block_check_events,
-};
-
-/**
- *	i2o_block_device_alloc - Allocate memory for a I2O Block device
- *
- *	Allocate memory for the i2o_block_device struct, gendisk and request
- *	queue and initialize them as far as no additional information is needed.
- *
- *	Returns a pointer to the allocated I2O Block device on success or a
- *	negative error code on failure.
- */
-static struct i2o_block_device *i2o_block_device_alloc(void)
-{
-	struct i2o_block_device *dev;
-	struct gendisk *gd;
-	struct request_queue *queue;
-	int rc;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev) {
-		osm_err("Insufficient memory to allocate I2O Block disk.\n");
-		rc = -ENOMEM;
-		goto exit;
-	}
-
-	INIT_LIST_HEAD(&dev->open_queue);
-	spin_lock_init(&dev->lock);
-	dev->rcache = CACHE_PREFETCH;
-	dev->wcache = CACHE_WRITEBACK;
-
-	/* allocate a gendisk with 16 partitions */
-	gd = alloc_disk(16);
-	if (!gd) {
-		osm_err("Insufficient memory to allocate gendisk.\n");
-		rc = -ENOMEM;
-		goto cleanup_dev;
-	}
-
-	/* initialize the request queue */
-	queue = blk_init_queue(i2o_block_request_fn, &dev->lock);
-	if (!queue) {
-		osm_err("Insufficient memory to allocate request queue.\n");
-		rc = -ENOMEM;
-		goto cleanup_queue;
-	}
-
-	blk_queue_prep_rq(queue, i2o_block_prep_req_fn);
-
-	gd->major = I2O_MAJOR;
-	gd->queue = queue;
-	gd->fops = &i2o_block_fops;
-	gd->private_data = dev;
-
-	dev->gd = gd;
-
-	return dev;
-
-      cleanup_queue:
-	put_disk(gd);
-
-      cleanup_dev:
-	kfree(dev);
-
-      exit:
-	return ERR_PTR(rc);
-};
-
-/**
- *	i2o_block_probe - verify if dev is a I2O Block device and install it
- *	@dev: device to verify if it is a I2O Block device
- *
- *	We only verify if the user_tid of the device is 0xfff and then install
- *	the device. Otherwise it is used by some other device (e. g. RAID).
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_probe(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_block_device *i2o_blk_dev;
-	struct gendisk *gd;
-	struct request_queue *queue;
-	static int unit = 0;
-	int rc;
-	u64 size;
-	u32 blocksize;
-	u16 body_size = 4;
-	u16 power;
-	unsigned short max_sectors;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-	if (c->adaptec)
-		body_size = 8;
-#endif
-
-	if (c->limit_sectors)
-		max_sectors = I2O_MAX_SECTORS_LIMITED;
-	else
-		max_sectors = I2O_MAX_SECTORS;
-
-	/* skip devices which are used by IOP */
-	if (i2o_dev->lct_data.user_tid != 0xfff) {
-		osm_debug("skipping used device %03x\n", i2o_dev->lct_data.tid);
-		return -ENODEV;
-	}
-
-	if (i2o_device_claim(i2o_dev)) {
-		osm_warn("Unable to claim device. Installation aborted\n");
-		rc = -EFAULT;
-		goto exit;
-	}
-
-	i2o_blk_dev = i2o_block_device_alloc();
-	if (IS_ERR(i2o_blk_dev)) {
-		osm_err("could not alloc a new I2O block device");
-		rc = PTR_ERR(i2o_blk_dev);
-		goto claim_release;
-	}
-
-	i2o_blk_dev->i2o_dev = i2o_dev;
-	dev_set_drvdata(dev, i2o_blk_dev);
-
-	/* setup gendisk */
-	gd = i2o_blk_dev->gd;
-	gd->first_minor = unit << 4;
-	sprintf(gd->disk_name, "i2o/hd%c", 'a' + unit);
-	gd->driverfs_dev = &i2o_dev->device;
-
-	/* setup request queue */
-	queue = gd->queue;
-	queue->queuedata = i2o_blk_dev;
-
-	blk_queue_max_hw_sectors(queue, max_sectors);
-	blk_queue_max_segments(queue, i2o_sg_tablesize(c, body_size));
-
-	osm_debug("max sectors = %d\n", queue->max_sectors);
-	osm_debug("phys segments = %d\n", queue->max_phys_segments);
-	osm_debug("max hw segments = %d\n", queue->max_hw_segments);
-
-	/*
-	 *      Ask for the current media data. If that isn't supported
-	 *      then we ask for the device capacity data
-	 */
-	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
-	    !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
-		blk_queue_logical_block_size(queue, le32_to_cpu(blocksize));
-	} else
-		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
-
-	if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
-	    !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
-		set_capacity(gd, le64_to_cpu(size) >> KERNEL_SECTOR_SHIFT);
-	} else
-		osm_warn("could not get size of %s\n", gd->disk_name);
-
-	if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2))
-		i2o_blk_dev->power = power;
-
-	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff);
-
-	add_disk(gd);
-
-	unit++;
-
-	osm_info("device added (TID: %03x): %s\n", i2o_dev->lct_data.tid,
-		 i2o_blk_dev->gd->disk_name);
-
-	return 0;
-
-      claim_release:
-	i2o_device_claim_release(i2o_dev);
-
-      exit:
-	return rc;
-};
-
-/* Block OSM driver struct */
-static struct i2o_driver i2o_block_driver = {
-	.name = OSM_NAME,
-	.event = i2o_block_event,
-	.reply = i2o_block_reply,
-	.classes = i2o_block_class_id,
-	.driver = {
-		   .probe = i2o_block_probe,
-		   .remove = i2o_block_remove,
-		   },
-};
-
-/**
- *	i2o_block_init - Block OSM initialization function
- *
- *	Allocate the slab and mempool for request structs, registers i2o_block
- *	block device and finally register the Block OSM in the I2O core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_block_init(void)
-{
-	int rc;
-	int size;
-
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
-	/* Allocate request mempool and slab */
-	size = sizeof(struct i2o_block_request);
-	i2o_blk_req_pool.slab = kmem_cache_create("i2o_block_req", size, 0,
-						  SLAB_HWCACHE_ALIGN, NULL);
-	if (!i2o_blk_req_pool.slab) {
-		osm_err("can't init request slab\n");
-		rc = -ENOMEM;
-		goto exit;
-	}
-
-	i2o_blk_req_pool.pool =
-		mempool_create_slab_pool(I2O_BLOCK_REQ_MEMPOOL_SIZE,
-					 i2o_blk_req_pool.slab);
-	if (!i2o_blk_req_pool.pool) {
-		osm_err("can't init request mempool\n");
-		rc = -ENOMEM;
-		goto free_slab;
-	}
-
-	/* Register the block device interfaces */
-	rc = register_blkdev(I2O_MAJOR, "i2o_block");
-	if (rc) {
-		osm_err("unable to register block device\n");
-		goto free_mempool;
-	}
-#ifdef MODULE
-	osm_info("registered device at major %d\n", I2O_MAJOR);
-#endif
-
-	/* Register Block OSM into I2O core */
-	rc = i2o_driver_register(&i2o_block_driver);
-	if (rc) {
-		osm_err("Could not register Block driver\n");
-		goto unregister_blkdev;
-	}
-
-	return 0;
-
-      unregister_blkdev:
-	unregister_blkdev(I2O_MAJOR, "i2o_block");
-
-      free_mempool:
-	mempool_destroy(i2o_blk_req_pool.pool);
-
-      free_slab:
-	kmem_cache_destroy(i2o_blk_req_pool.slab);
-
-      exit:
-	return rc;
-};
-
-/**
- *	i2o_block_exit - Block OSM exit function
- *
- *	Unregisters Block OSM from I2O core, unregisters i2o_block block device
- *	and frees the mempool and slab.
- */
-static void __exit i2o_block_exit(void)
-{
-	/* Unregister I2O Block OSM from I2O core */
-	i2o_driver_unregister(&i2o_block_driver);
-
-	/* Unregister block device */
-	unregister_blkdev(I2O_MAJOR, "i2o_block");
-
-	/* Free request mempool and slab */
-	mempool_destroy(i2o_blk_req_pool.pool);
-	kmem_cache_destroy(i2o_blk_req_pool.slab);
-};
-
-MODULE_AUTHOR("Red Hat");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-module_init(i2o_block_init);
-module_exit(i2o_block_exit);
diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h
deleted file mode 100644
index cf8873cbca3f..000000000000
--- a/drivers/message/i2o/i2o_block.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *	Block OSM structures/API
- *
- * 	Copyright (C) 1999-2002	Red Hat Software
- *
- *	Written by Alan Cox, Building Number Three Ltd
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	This program is distributed in the hope that it will be useful, but
- *	WITHOUT ANY WARRANTY; without even the implied warranty of
- *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *	General Public License for more details.
- *
- *	For the purpose of avoiding doubt the preferred form of the work
- *	for making modifications shall be a standards compliant form such
- *	gzipped tar and not one requiring a proprietary or patent encumbered
- *	tool to unpack.
- *
- *	Fixes/additions:
- *		Steve Ralston:
- *			Multiple device handling error fixes,
- *			Added a queue depth.
- *		Alan Cox:
- *			FC920 has an rmw bug. Dont or in the end marker.
- *			Removed queue walk, fixed for 64bitness.
- *			Rewrote much of the code over time
- *			Added indirect block lists
- *			Handle 64K limits on many controllers
- *			Don't use indirects on the Promise (breaks)
- *			Heavily chop down the queue depths
- *		Deepak Saxena:
- *			Independent queues per IOP
- *			Support for dynamic device creation/deletion
- *			Code cleanup
- *	    		Support for larger I/Os through merge* functions
- *			(taken from DAC960 driver)
- *		Boji T Kannanthanam:
- *			Set the I2O Block devices to be detected in increasing
- *			order of TIDs during boot.
- *			Search and set the I2O block device that we boot off
- *			from as the first device to be claimed (as /dev/i2o/hda)
- *			Properly attach/detach I2O gendisk structure from the
- *			system gendisk list. The I2O block devices now appear in
- *			/proc/partitions.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Minor bugfixes for 2.6.
- */
-
-#ifndef I2O_BLOCK_OSM_H
-#define I2O_BLOCK_OSM_H
-
-#define I2O_BLOCK_RETRY_TIME HZ/4
-#define I2O_BLOCK_MAX_OPEN_REQUESTS 50
-
-/* request queue sizes */
-#define I2O_BLOCK_REQ_MEMPOOL_SIZE		32
-
-#define KERNEL_SECTOR_SHIFT 9
-#define KERNEL_SECTOR_SIZE (1 << KERNEL_SECTOR_SHIFT)
-
-/* I2O Block OSM mempool struct */
-struct i2o_block_mempool {
-	struct kmem_cache *slab;
-	mempool_t *pool;
-};
-
-/* I2O Block device descriptor */
-struct i2o_block_device {
-	struct i2o_device *i2o_dev;	/* pointer to I2O device */
-	struct gendisk *gd;
-	spinlock_t lock;	/* queue lock */
-	struct list_head open_queue;	/* list of transferred, but unfinished
-					   requests */
-	unsigned int open_queue_depth;	/* number of requests in the queue */
-
-	int rcache;		/* read cache flags */
-	int wcache;		/* write cache flags */
-	int flags;
-	u16 power;		/* power state */
-	int media_change_flag;	/* media changed flag */
-};
-
-/* I2O Block device request */
-struct i2o_block_request {
-	struct list_head queue;
-	struct request *req;	/* corresponding request */
-	struct i2o_block_device *i2o_blk_dev;	/* I2O block device */
-	struct device *dev;	/* device used for DMA */
-	int sg_nents;		/* number of SG elements */
-	struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS];	/* SG table */
-};
-
-/* I2O Block device delayed request */
-struct i2o_block_delayed_request {
-	struct delayed_work work;
-	struct request_queue *queue;
-};
-
-#endif
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
deleted file mode 100644
index 04bd3b6de401..000000000000
--- a/drivers/message/i2o/i2o_config.c
+++ /dev/null
@@ -1,1163 +0,0 @@
-/*
- * I2O Configuration Interface Driver
- *
- * (C) Copyright 1999-2002  Red Hat
- *
- * Written by Alan Cox, Building Number Three Ltd
- *
- * Fixes/additions:
- *	Deepak Saxena (04/20/1999):
- *		Added basic ioctl() support
- *	Deepak Saxena (06/07/1999):
- *		Added software download ioctl (still testing)
- *	Auvo Häkkinen (09/10/1999):
- *		Changes to i2o_cfg_reply(), ioctl_parms()
- *		Added ioct_validate()
- *	Taneli Vähäkangas (09/30/1999):
- *		Fixed ioctl_swdl()
- *	Taneli Vähäkangas (10/04/1999):
- *		Changed ioctl_swdl(), implemented ioctl_swul() and ioctl_swdel()
- *	Deepak Saxena (11/18/1999):
- *		Added event managmenet support
- *	Alan Cox <alan@lxorguk.ukuu.org.uk>:
- *		2.4 rewrite ported to 2.5
- *	Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *		Added pass-thru support for Adaptec's raidutils
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/miscdevice.h>
-#include <linux/mutex.h>
-#include <linux/compat.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-
-#include "core.h"
-
-#define SG_TABLESIZE		30
-
-static DEFINE_MUTEX(i2o_cfg_mutex);
-static long i2o_cfg_ioctl(struct file *, unsigned int, unsigned long);
-
-static spinlock_t i2o_config_lock;
-
-#define MODINC(x,y) ((x) = ((x) + 1) % (y))
-
-struct sg_simple_element {
-	u32 flag_count;
-	u32 addr_bus;
-};
-
-struct i2o_cfg_info {
-	struct file *fp;
-	struct fasync_struct *fasync;
-	struct i2o_evt_info event_q[I2O_EVT_Q_LEN];
-	u16 q_in;		// Queue head index
-	u16 q_out;		// Queue tail index
-	u16 q_len;		// Queue length
-	u16 q_lost;		// Number of lost events
-	ulong q_id;		// Event queue ID...used as tx_context
-	struct i2o_cfg_info *next;
-};
-static struct i2o_cfg_info *open_files = NULL;
-static ulong i2o_cfg_info_id = 0;
-
-static int i2o_cfg_getiops(unsigned long arg)
-{
-	struct i2o_controller *c;
-	u8 __user *user_iop_table = (void __user *)arg;
-	u8 tmp[MAX_I2O_CONTROLLERS];
-	int ret = 0;
-
-	memset(tmp, 0, MAX_I2O_CONTROLLERS);
-
-	list_for_each_entry(c, &i2o_controllers, list)
-	    tmp[c->unit] = 1;
-
-	if (copy_to_user(user_iop_table, tmp, MAX_I2O_CONTROLLERS))
-		ret = -EFAULT;
-
-	return ret;
-};
-
-static int i2o_cfg_gethrt(unsigned long arg)
-{
-	struct i2o_controller *c;
-	struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg;
-	struct i2o_cmd_hrtlct kcmd;
-	i2o_hrt *hrt;
-	int len;
-	u32 reslen;
-	int ret = 0;
-
-	if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct)))
-		return -EFAULT;
-
-	if (get_user(reslen, kcmd.reslen) < 0)
-		return -EFAULT;
-
-	if (kcmd.resbuf == NULL)
-		return -EFAULT;
-
-	c = i2o_find_iop(kcmd.iop);
-	if (!c)
-		return -ENXIO;
-
-	hrt = (i2o_hrt *) c->hrt.virt;
-
-	len = 8 + ((hrt->entry_len * hrt->num_entries) << 2);
-
-	if (put_user(len, kcmd.reslen))
-		ret = -EFAULT;
-	else if (len > reslen)
-		ret = -ENOBUFS;
-	else if (copy_to_user(kcmd.resbuf, (void *)hrt, len))
-		ret = -EFAULT;
-
-	return ret;
-};
-
-static int i2o_cfg_getlct(unsigned long arg)
-{
-	struct i2o_controller *c;
-	struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg;
-	struct i2o_cmd_hrtlct kcmd;
-	i2o_lct *lct;
-	int len;
-	int ret = 0;
-	u32 reslen;
-
-	if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct)))
-		return -EFAULT;
-
-	if (get_user(reslen, kcmd.reslen) < 0)
-		return -EFAULT;
-
-	if (kcmd.resbuf == NULL)
-		return -EFAULT;
-
-	c = i2o_find_iop(kcmd.iop);
-	if (!c)
-		return -ENXIO;
-
-	lct = (i2o_lct *) c->lct;
-
-	len = (unsigned int)lct->table_size << 2;
-	if (put_user(len, kcmd.reslen))
-		ret = -EFAULT;
-	else if (len > reslen)
-		ret = -ENOBUFS;
-	else if (copy_to_user(kcmd.resbuf, lct, len))
-		ret = -EFAULT;
-
-	return ret;
-};
-
-static int i2o_cfg_parms(unsigned long arg, unsigned int type)
-{
-	int ret = 0;
-	struct i2o_controller *c;
-	struct i2o_device *dev;
-	struct i2o_cmd_psetget __user *cmd =
-	    (struct i2o_cmd_psetget __user *)arg;
-	struct i2o_cmd_psetget kcmd;
-	u32 reslen;
-	u8 *ops;
-	u8 *res;
-	int len = 0;
-
-	u32 i2o_cmd = (type == I2OPARMGET ?
-		       I2O_CMD_UTIL_PARAMS_GET : I2O_CMD_UTIL_PARAMS_SET);
-
-	if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_psetget)))
-		return -EFAULT;
-
-	if (get_user(reslen, kcmd.reslen))
-		return -EFAULT;
-
-	c = i2o_find_iop(kcmd.iop);
-	if (!c)
-		return -ENXIO;
-
-	dev = i2o_iop_find_device(c, kcmd.tid);
-	if (!dev)
-		return -ENXIO;
-
-	/*
-	 * Stop users being able to try and allocate arbitrary amounts
-	 * of DMA space. 64K is way more than sufficient for this.
-	 */
-	if (kcmd.oplen > 65536)
-		return -EMSGSIZE;
-
-	ops = memdup_user(kcmd.opbuf, kcmd.oplen);
-	if (IS_ERR(ops))
-		return PTR_ERR(ops);
-
-	/*
-	 * It's possible to have a _very_ large table
-	 * and that the user asks for all of it at once...
-	 */
-	res = kmalloc(65536, GFP_KERNEL);
-	if (!res) {
-		kfree(ops);
-		return -ENOMEM;
-	}
-
-	len = i2o_parm_issue(dev, i2o_cmd, ops, kcmd.oplen, res, 65536);
-	kfree(ops);
-
-	if (len < 0) {
-		kfree(res);
-		return -EAGAIN;
-	}
-
-	if (put_user(len, kcmd.reslen))
-		ret = -EFAULT;
-	else if (len > reslen)
-		ret = -ENOBUFS;
-	else if (copy_to_user(kcmd.resbuf, res, len))
-		ret = -EFAULT;
-
-	kfree(res);
-
-	return ret;
-};
-
-static int i2o_cfg_swdl(unsigned long arg)
-{
-	struct i2o_sw_xfer kxfer;
-	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
-	unsigned char maxfrag = 0, curfrag = 1;
-	struct i2o_dma buffer;
-	struct i2o_message *msg;
-	unsigned int status = 0, swlen = 0, fragsize = 8192;
-	struct i2o_controller *c;
-
-	if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer)))
-		return -EFAULT;
-
-	if (get_user(swlen, kxfer.swlen) < 0)
-		return -EFAULT;
-
-	if (get_user(maxfrag, kxfer.maxfrag) < 0)
-		return -EFAULT;
-
-	if (get_user(curfrag, kxfer.curfrag) < 0)
-		return -EFAULT;
-
-	if (curfrag == maxfrag)
-		fragsize = swlen - (maxfrag - 1) * 8192;
-
-	if (!kxfer.buf || !access_ok(VERIFY_READ, kxfer.buf, fragsize))
-		return -EFAULT;
-
-	c = i2o_find_iop(kxfer.iop);
-	if (!c)
-		return -ENXIO;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) {
-		i2o_msg_nop(c, msg);
-		return -ENOMEM;
-	}
-
-	if (__copy_from_user(buffer.virt, kxfer.buf, fragsize)) {
-		i2o_msg_nop(c, msg);
-		i2o_dma_free(&c->pdev->dev, &buffer);
-		return -EFAULT;
-	}
-
-	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
-	msg->u.head[3] = cpu_to_le32(0);
-	msg->body[0] =
-	    cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer.
-							sw_type) << 16) |
-			(((u32) maxfrag) << 8) | (((u32) curfrag)));
-	msg->body[1] = cpu_to_le32(swlen);
-	msg->body[2] = cpu_to_le32(kxfer.sw_id);
-	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
-	msg->body[4] = cpu_to_le32(buffer.phys);
-
-	osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
-	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
-
-	if (status != -ETIMEDOUT)
-		i2o_dma_free(&c->pdev->dev, &buffer);
-
-	if (status != I2O_POST_WAIT_OK) {
-		// it fails if you try and send frags out of order
-		// and for some yet unknown reasons too
-		osm_info("swdl failed, DetailedStatus = %d\n", status);
-		return status;
-	}
-
-	return 0;
-};
-
-static int i2o_cfg_swul(unsigned long arg)
-{
-	struct i2o_sw_xfer kxfer;
-	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
-	unsigned char maxfrag = 0, curfrag = 1;
-	struct i2o_dma buffer;
-	struct i2o_message *msg;
-	unsigned int status = 0, swlen = 0, fragsize = 8192;
-	struct i2o_controller *c;
-	int ret = 0;
-
-	if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer)))
-		return -EFAULT;
-
-	if (get_user(swlen, kxfer.swlen) < 0)
-		return -EFAULT;
-
-	if (get_user(maxfrag, kxfer.maxfrag) < 0)
-		return -EFAULT;
-
-	if (get_user(curfrag, kxfer.curfrag) < 0)
-		return -EFAULT;
-
-	if (curfrag == maxfrag)
-		fragsize = swlen - (maxfrag - 1) * 8192;
-
-	if (!kxfer.buf)
-		return -EFAULT;
-
-	c = i2o_find_iop(kxfer.iop);
-	if (!c)
-		return -ENXIO;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) {
-		i2o_msg_nop(c, msg);
-		return -ENOMEM;
-	}
-
-	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID);
-	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
-	msg->u.head[3] = cpu_to_le32(0);
-	msg->body[0] =
-	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.
-			sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag);
-	msg->body[1] = cpu_to_le32(swlen);
-	msg->body[2] = cpu_to_le32(kxfer.sw_id);
-	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
-	msg->body[4] = cpu_to_le32(buffer.phys);
-
-	osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
-	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
-
-	if (status != I2O_POST_WAIT_OK) {
-		if (status != -ETIMEDOUT)
-			i2o_dma_free(&c->pdev->dev, &buffer);
-
-		osm_info("swul failed, DetailedStatus = %d\n", status);
-		return status;
-	}
-
-	if (copy_to_user(kxfer.buf, buffer.virt, fragsize))
-		ret = -EFAULT;
-
-	i2o_dma_free(&c->pdev->dev, &buffer);
-
-	return ret;
-}
-
-static int i2o_cfg_swdel(unsigned long arg)
-{
-	struct i2o_controller *c;
-	struct i2o_sw_xfer kxfer;
-	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
-	struct i2o_message *msg;
-	unsigned int swlen;
-	int token;
-
-	if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer)))
-		return -EFAULT;
-
-	if (get_user(swlen, kxfer.swlen) < 0)
-		return -EFAULT;
-
-	c = i2o_find_iop(kxfer.iop);
-	if (!c)
-		return -ENXIO;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID);
-	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
-	msg->u.head[3] = cpu_to_le32(0);
-	msg->body[0] =
-	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16);
-	msg->body[1] = cpu_to_le32(swlen);
-	msg->body[2] = cpu_to_le32(kxfer.sw_id);
-
-	token = i2o_msg_post_wait(c, msg, 10);
-
-	if (token != I2O_POST_WAIT_OK) {
-		osm_info("swdel failed, DetailedStatus = %d\n", token);
-		return -ETIMEDOUT;
-	}
-
-	return 0;
-};
-
-static int i2o_cfg_validate(unsigned long arg)
-{
-	int token;
-	int iop = (int)arg;
-	struct i2o_message *msg;
-	struct i2o_controller *c;
-
-	c = i2o_find_iop(iop);
-	if (!c)
-		return -ENXIO;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop);
-	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
-	msg->u.head[3] = cpu_to_le32(0);
-
-	token = i2o_msg_post_wait(c, msg, 10);
-
-	if (token != I2O_POST_WAIT_OK) {
-		osm_info("Can't validate configuration, ErrorStatus = %d\n",
-			 token);
-		return -ETIMEDOUT;
-	}
-
-	return 0;
-};
-
-static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
-{
-	struct i2o_message *msg;
-	struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg;
-	struct i2o_evt_id kdesc;
-	struct i2o_controller *c;
-	struct i2o_device *d;
-
-	if (copy_from_user(&kdesc, pdesc, sizeof(struct i2o_evt_id)))
-		return -EFAULT;
-
-	/* IOP exists? */
-	c = i2o_find_iop(kdesc.iop);
-	if (!c)
-		return -ENXIO;
-
-	/* Device exists? */
-	d = i2o_iop_find_device(c, kdesc.tid);
-	if (!d)
-		return -ENODEV;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 |
-			kdesc.tid);
-	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
-	msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data));
-	msg->body[0] = cpu_to_le32(kdesc.evt_mask);
-
-	i2o_msg_post(c, msg);
-
-	return 0;
-}
-
-static int i2o_cfg_evt_get(unsigned long arg, struct file *fp)
-{
-	struct i2o_cfg_info *p = NULL;
-	struct i2o_evt_get __user *uget = (struct i2o_evt_get __user *)arg;
-	struct i2o_evt_get kget;
-	unsigned long flags;
-
-	for (p = open_files; p; p = p->next)
-		if (p->q_id == (ulong) fp->private_data)
-			break;
-
-	if (!p->q_len)
-		return -ENOENT;
-
-	memcpy(&kget.info, &p->event_q[p->q_out], sizeof(struct i2o_evt_info));
-	MODINC(p->q_out, I2O_EVT_Q_LEN);
-	spin_lock_irqsave(&i2o_config_lock, flags);
-	p->q_len--;
-	kget.pending = p->q_len;
-	kget.lost = p->q_lost;
-	spin_unlock_irqrestore(&i2o_config_lock, flags);
-
-	if (copy_to_user(uget, &kget, sizeof(struct i2o_evt_get)))
-		return -EFAULT;
-	return 0;
-}
-
-#ifdef CONFIG_COMPAT
-static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
-			      unsigned long arg)
-{
-	struct i2o_cmd_passthru32 __user *cmd;
-	struct i2o_controller *c;
-	u32 __user *user_msg;
-	u32 *reply = NULL;
-	u32 __user *user_reply = NULL;
-	u32 size = 0;
-	u32 reply_size = 0;
-	u32 rcode = 0;
-	struct i2o_dma sg_list[SG_TABLESIZE];
-	u32 sg_offset = 0;
-	u32 sg_count = 0;
-	u32 i = 0;
-	u32 sg_index = 0;
-	i2o_status_block *sb;
-	struct i2o_message *msg;
-	unsigned int iop;
-
-	cmd = (struct i2o_cmd_passthru32 __user *)arg;
-
-	if (get_user(iop, &cmd->iop) || get_user(i, &cmd->msg))
-		return -EFAULT;
-
-	user_msg = compat_ptr(i);
-
-	c = i2o_find_iop(iop);
-	if (!c) {
-		osm_debug("controller %d not found\n", iop);
-		return -ENXIO;
-	}
-
-	sb = c->status_block.virt;
-
-	if (get_user(size, &user_msg[0])) {
-		osm_warn("unable to get size!\n");
-		return -EFAULT;
-	}
-	size = size >> 16;
-
-	if (size > sb->inbound_frame_size) {
-		osm_warn("size of message > inbound_frame_size");
-		return -EFAULT;
-	}
-
-	user_reply = &user_msg[size];
-
-	size <<= 2;		// Convert to bytes
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	rcode = -EFAULT;
-	/* Copy in the user's I2O command */
-	if (copy_from_user(msg, user_msg, size)) {
-		osm_warn("unable to copy user message\n");
-		goto out;
-	}
-	i2o_dump_message(msg);
-
-	if (get_user(reply_size, &user_reply[0]) < 0)
-		goto out;
-
-	reply_size >>= 16;
-	reply_size <<= 2;
-
-	rcode = -ENOMEM;
-	reply = kzalloc(reply_size, GFP_KERNEL);
-	if (!reply) {
-		printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
-		       c->name);
-		goto out;
-	}
-
-	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
-
-	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
-	if (sg_offset) {
-		struct sg_simple_element *sg;
-
-		if (sg_offset * 4 >= size) {
-			rcode = -EFAULT;
-			goto cleanup;
-		}
-		// TODO 64bit fix
-		sg = (struct sg_simple_element *)((&msg->u.head[0]) +
-						  sg_offset);
-		sg_count =
-		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
-		if (sg_count > SG_TABLESIZE) {
-			printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n",
-			       c->name, sg_count);
-			rcode = -EINVAL;
-			goto cleanup;
-		}
-
-		for (i = 0; i < sg_count; i++) {
-			int sg_size;
-			struct i2o_dma *p;
-
-			if (!(sg[i].flag_count & 0x10000000
-			      /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) {
-				printk(KERN_DEBUG
-				       "%s:Bad SG element %d - not simple (%x)\n",
-				       c->name, i, sg[i].flag_count);
-				rcode = -EINVAL;
-				goto cleanup;
-			}
-			sg_size = sg[i].flag_count & 0xffffff;
-			p = &(sg_list[sg_index]);
-			/* Allocate memory for the transfer */
-			if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) {
-				printk(KERN_DEBUG
-				       "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n",
-				       c->name, sg_size, i, sg_count);
-				rcode = -ENOMEM;
-				goto sg_list_cleanup;
-			}
-			sg_index++;
-			/* Copy in the user's SG buffer if necessary */
-			if (sg[i].
-			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
-				// TODO 64bit fix
-				if (copy_from_user
-				    (p->virt,
-				     (void __user *)(unsigned long)sg[i].
-				     addr_bus, sg_size)) {
-					printk(KERN_DEBUG
-					       "%s: Could not copy SG buf %d FROM user\n",
-					       c->name, i);
-					rcode = -EFAULT;
-					goto sg_list_cleanup;
-				}
-			}
-			//TODO 64bit fix
-			sg[i].addr_bus = (u32) p->phys;
-		}
-	}
-
-	rcode = i2o_msg_post_wait(c, msg, 60);
-	msg = NULL;
-	if (rcode) {
-		reply[4] = ((u32) rcode) << 24;
-		goto sg_list_cleanup;
-	}
-
-	if (sg_offset) {
-		u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE];
-		/* Copy back the Scatter Gather buffers back to user space */
-		u32 j;
-		// TODO 64bit fix
-		struct sg_simple_element *sg;
-		int sg_size;
-
-		// re-acquire the original message to handle correctly the sg copy operation
-		memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4);
-		// get user msg size in u32s
-		if (get_user(size, &user_msg[0])) {
-			rcode = -EFAULT;
-			goto sg_list_cleanup;
-		}
-		size = size >> 16;
-		size *= 4;
-		if (size > sizeof(rmsg)) {
-			rcode = -EINVAL;
-			goto sg_list_cleanup;
-		}
-
-		/* Copy in the user's I2O command */
-		if (copy_from_user(rmsg, user_msg, size)) {
-			rcode = -EFAULT;
-			goto sg_list_cleanup;
-		}
-		sg_count =
-		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
-
-		// TODO 64bit fix
-		sg = (struct sg_simple_element *)(rmsg + sg_offset);
-		for (j = 0; j < sg_count; j++) {
-			/* Copy out the SG list to user's buffer if necessary */
-			if (!
-			    (sg[j].
-			     flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) {
-				sg_size = sg[j].flag_count & 0xffffff;
-				// TODO 64bit fix
-				if (copy_to_user
-				    ((void __user *)(u64) sg[j].addr_bus,
-				     sg_list[j].virt, sg_size)) {
-					printk(KERN_WARNING
-					       "%s: Could not copy %p TO user %x\n",
-					       c->name, sg_list[j].virt,
-					       sg[j].addr_bus);
-					rcode = -EFAULT;
-					goto sg_list_cleanup;
-				}
-			}
-		}
-	}
-
-sg_list_cleanup:
-	/* Copy back the reply to user space */
-	if (reply_size) {
-		// we wrote our own values for context - now restore the user supplied ones
-		if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) {
-			printk(KERN_WARNING
-			       "%s: Could not copy message context FROM user\n",
-			       c->name);
-			rcode = -EFAULT;
-		}
-		if (copy_to_user(user_reply, reply, reply_size)) {
-			printk(KERN_WARNING
-			       "%s: Could not copy reply TO user\n", c->name);
-			rcode = -EFAULT;
-		}
-	}
-	for (i = 0; i < sg_index; i++)
-		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
-
-cleanup:
-	kfree(reply);
-out:
-	if (msg)
-		i2o_msg_nop(c, msg);
-	return rcode;
-}
-
-static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd,
-				 unsigned long arg)
-{
-	int ret;
-	switch (cmd) {
-	case I2OGETIOPS:
-		ret = i2o_cfg_ioctl(file, cmd, arg);
-		break;
-	case I2OPASSTHRU32:
-		mutex_lock(&i2o_cfg_mutex);
-		ret = i2o_cfg_passthru32(file, cmd, arg);
-		mutex_unlock(&i2o_cfg_mutex);
-		break;
-	default:
-		ret = -ENOIOCTLCMD;
-		break;
-	}
-	return ret;
-}
-
-#endif
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-static int i2o_cfg_passthru(unsigned long arg)
-{
-	struct i2o_cmd_passthru __user *cmd =
-	    (struct i2o_cmd_passthru __user *)arg;
-	struct i2o_controller *c;
-	u32 __user *user_msg;
-	u32 *reply = NULL;
-	u32 __user *user_reply = NULL;
-	u32 size = 0;
-	u32 reply_size = 0;
-	u32 rcode = 0;
-	struct i2o_dma sg_list[SG_TABLESIZE];
-	u32 sg_offset = 0;
-	u32 sg_count = 0;
-	int sg_index = 0;
-	u32 i = 0;
-	i2o_status_block *sb;
-	struct i2o_message *msg;
-	unsigned int iop;
-
-	if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg))
-		return -EFAULT;
-
-	c = i2o_find_iop(iop);
-	if (!c) {
-		osm_warn("controller %d not found\n", iop);
-		return -ENXIO;
-	}
-
-	sb = c->status_block.virt;
-
-	if (get_user(size, &user_msg[0]))
-		return -EFAULT;
-	size = size >> 16;
-
-	if (size > sb->inbound_frame_size) {
-		osm_warn("size of message > inbound_frame_size");
-		return -EFAULT;
-	}
-
-	user_reply = &user_msg[size];
-
-	size <<= 2;		// Convert to bytes
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	rcode = -EFAULT;
-	/* Copy in the user's I2O command */
-	if (copy_from_user(msg, user_msg, size))
-		goto out;
-
-	if (get_user(reply_size, &user_reply[0]) < 0)
-		goto out;
-
-	reply_size >>= 16;
-	reply_size <<= 2;
-
-	reply = kzalloc(reply_size, GFP_KERNEL);
-	if (!reply) {
-		printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
-		       c->name);
-		rcode = -ENOMEM;
-		goto out;
-	}
-
-	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
-
-	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
-	if (sg_offset) {
-		struct sg_simple_element *sg;
-		struct i2o_dma *p;
-
-		if (sg_offset * 4 >= size) {
-			rcode = -EFAULT;
-			goto cleanup;
-		}
-		// TODO 64bit fix
-		sg = (struct sg_simple_element *)((&msg->u.head[0]) +
-						  sg_offset);
-		sg_count =
-		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
-		if (sg_count > SG_TABLESIZE) {
-			printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n",
-			       c->name, sg_count);
-			rcode = -EINVAL;
-			goto cleanup;
-		}
-
-		for (i = 0; i < sg_count; i++) {
-			int sg_size;
-
-			if (!(sg[i].flag_count & 0x10000000
-			      /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) {
-				printk(KERN_DEBUG
-				       "%s:Bad SG element %d - not simple (%x)\n",
-				       c->name, i, sg[i].flag_count);
-				rcode = -EINVAL;
-				goto sg_list_cleanup;
-			}
-			sg_size = sg[i].flag_count & 0xffffff;
-			p = &(sg_list[sg_index]);
-			if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) {
-			/* Allocate memory for the transfer */
-				printk(KERN_DEBUG
-				       "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n",
-				       c->name, sg_size, i, sg_count);
-				rcode = -ENOMEM;
-				goto sg_list_cleanup;
-			}
-			sg_index++;
-			/* Copy in the user's SG buffer if necessary */
-			if (sg[i].
-			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
-				// TODO 64bit fix
-				if (copy_from_user
-				    (p->virt, (void __user *)sg[i].addr_bus,
-				     sg_size)) {
-					printk(KERN_DEBUG
-					       "%s: Could not copy SG buf %d FROM user\n",
-					       c->name, i);
-					rcode = -EFAULT;
-					goto sg_list_cleanup;
-				}
-			}
-			sg[i].addr_bus = p->phys;
-		}
-	}
-
-	rcode = i2o_msg_post_wait(c, msg, 60);
-	msg = NULL;
-	if (rcode) {
-		reply[4] = ((u32) rcode) << 24;
-		goto sg_list_cleanup;
-	}
-
-	if (sg_offset) {
-		u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE];
-		/* Copy back the Scatter Gather buffers back to user space */
-		u32 j;
-		// TODO 64bit fix
-		struct sg_simple_element *sg;
-		int sg_size;
-
-		// re-acquire the original message to handle correctly the sg copy operation
-		memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4);
-		// get user msg size in u32s
-		if (get_user(size, &user_msg[0])) {
-			rcode = -EFAULT;
-			goto sg_list_cleanup;
-		}
-		size = size >> 16;
-		size *= 4;
-		if (size > sizeof(rmsg)) {
-			rcode = -EFAULT;
-			goto sg_list_cleanup;
-		}
-
-		/* Copy in the user's I2O command */
-		if (copy_from_user(rmsg, user_msg, size)) {
-			rcode = -EFAULT;
-			goto sg_list_cleanup;
-		}
-		sg_count =
-		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
-
-		// TODO 64bit fix
-		sg = (struct sg_simple_element *)(rmsg + sg_offset);
-		for (j = 0; j < sg_count; j++) {
-			/* Copy out the SG list to user's buffer if necessary */
-			if (!
-			    (sg[j].
-			     flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) {
-				sg_size = sg[j].flag_count & 0xffffff;
-				// TODO 64bit fix
-				if (copy_to_user
-				    ((void __user *)sg[j].addr_bus, sg_list[j].virt,
-				     sg_size)) {
-					printk(KERN_WARNING
-					       "%s: Could not copy %p TO user %x\n",
-					       c->name, sg_list[j].virt,
-					       sg[j].addr_bus);
-					rcode = -EFAULT;
-					goto sg_list_cleanup;
-				}
-			}
-		}
-	}
-
-sg_list_cleanup:
-	/* Copy back the reply to user space */
-	if (reply_size) {
-		// we wrote our own values for context - now restore the user supplied ones
-		if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) {
-			printk(KERN_WARNING
-			       "%s: Could not copy message context FROM user\n",
-			       c->name);
-			rcode = -EFAULT;
-		}
-		if (copy_to_user(user_reply, reply, reply_size)) {
-			printk(KERN_WARNING
-			       "%s: Could not copy reply TO user\n", c->name);
-			rcode = -EFAULT;
-		}
-	}
-
-	for (i = 0; i < sg_index; i++)
-		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
-
-cleanup:
-	kfree(reply);
-out:
-	if (msg)
-		i2o_msg_nop(c, msg);
-	return rcode;
-}
-#endif
-
-/*
- * IOCTL Handler
- */
-static long i2o_cfg_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	mutex_lock(&i2o_cfg_mutex);
-	switch (cmd) {
-	case I2OGETIOPS:
-		ret = i2o_cfg_getiops(arg);
-		break;
-
-	case I2OHRTGET:
-		ret = i2o_cfg_gethrt(arg);
-		break;
-
-	case I2OLCTGET:
-		ret = i2o_cfg_getlct(arg);
-		break;
-
-	case I2OPARMSET:
-		ret = i2o_cfg_parms(arg, I2OPARMSET);
-		break;
-
-	case I2OPARMGET:
-		ret = i2o_cfg_parms(arg, I2OPARMGET);
-		break;
-
-	case I2OSWDL:
-		ret = i2o_cfg_swdl(arg);
-		break;
-
-	case I2OSWUL:
-		ret = i2o_cfg_swul(arg);
-		break;
-
-	case I2OSWDEL:
-		ret = i2o_cfg_swdel(arg);
-		break;
-
-	case I2OVALIDATE:
-		ret = i2o_cfg_validate(arg);
-		break;
-
-	case I2OEVTREG:
-		ret = i2o_cfg_evt_reg(arg, fp);
-		break;
-
-	case I2OEVTGET:
-		ret = i2o_cfg_evt_get(arg, fp);
-		break;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-	case I2OPASSTHRU:
-		ret = i2o_cfg_passthru(arg);
-		break;
-#endif
-
-	default:
-		osm_debug("unknown ioctl called!\n");
-		ret = -EINVAL;
-	}
-	mutex_unlock(&i2o_cfg_mutex);
-	return ret;
-}
-
-static int cfg_open(struct inode *inode, struct file *file)
-{
-	struct i2o_cfg_info *tmp = kmalloc(sizeof(struct i2o_cfg_info),
-					   GFP_KERNEL);
-	unsigned long flags;
-
-	if (!tmp)
-		return -ENOMEM;
-
-	mutex_lock(&i2o_cfg_mutex);
-	file->private_data = (void *)(i2o_cfg_info_id++);
-	tmp->fp = file;
-	tmp->fasync = NULL;
-	tmp->q_id = (ulong) file->private_data;
-	tmp->q_len = 0;
-	tmp->q_in = 0;
-	tmp->q_out = 0;
-	tmp->q_lost = 0;
-	tmp->next = open_files;
-
-	spin_lock_irqsave(&i2o_config_lock, flags);
-	open_files = tmp;
-	spin_unlock_irqrestore(&i2o_config_lock, flags);
-	mutex_unlock(&i2o_cfg_mutex);
-
-	return 0;
-}
-
-static int cfg_fasync(int fd, struct file *fp, int on)
-{
-	ulong id = (ulong) fp->private_data;
-	struct i2o_cfg_info *p;
-	int ret = -EBADF;
-
-	mutex_lock(&i2o_cfg_mutex);
-	for (p = open_files; p; p = p->next)
-		if (p->q_id == id)
-			break;
-
-	if (p)
-		ret = fasync_helper(fd, fp, on, &p->fasync);
-	mutex_unlock(&i2o_cfg_mutex);
-	return ret;
-}
-
-static int cfg_release(struct inode *inode, struct file *file)
-{
-	ulong id = (ulong) file->private_data;
-	struct i2o_cfg_info *p, **q;
-	unsigned long flags;
-
-	mutex_lock(&i2o_cfg_mutex);
-	spin_lock_irqsave(&i2o_config_lock, flags);
-	for (q = &open_files; (p = *q) != NULL; q = &p->next) {
-		if (p->q_id == id) {
-			*q = p->next;
-			kfree(p);
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&i2o_config_lock, flags);
-	mutex_unlock(&i2o_cfg_mutex);
-
-	return 0;
-}
-
-static const struct file_operations config_fops = {
-	.owner = THIS_MODULE,
-	.llseek = no_llseek,
-	.unlocked_ioctl = i2o_cfg_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl = i2o_cfg_compat_ioctl,
-#endif
-	.open = cfg_open,
-	.release = cfg_release,
-	.fasync = cfg_fasync,
-};
-
-static struct miscdevice i2o_miscdev = {
-	I2O_MINOR,
-	"i2octl",
-	&config_fops
-};
-
-static int __init i2o_config_old_init(void)
-{
-	spin_lock_init(&i2o_config_lock);
-
-	if (misc_register(&i2o_miscdev) < 0) {
-		osm_err("can't register device.\n");
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-static void i2o_config_old_exit(void)
-{
-	misc_deregister(&i2o_miscdev);
-}
-
-MODULE_AUTHOR("Red Hat Software");
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
deleted file mode 100644
index b7d87cd227a9..000000000000
--- a/drivers/message/i2o/i2o_proc.c
+++ /dev/null
@@ -1,2045 +0,0 @@
-/*
- *	procfs handler for Linux I2O subsystem
- *
- *	(c) Copyright 1999	Deepak Saxena
- *
- *	Originally written by Deepak Saxena(deepak@plexity.net)
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	This is an initial test release. The code is based on the design of the
- *	ide procfs system (drivers/block/ide-proc.c). Some code taken from
- *	i2o-core module by Alan Cox.
- *
- *	DISCLAIMER: This code is still under development/test and may cause
- *	your system to behave unpredictably.  Use at your own discretion.
- *
- *
- *	Fixes/additions:
- *		Juha Sievänen (Juha.Sievanen@cs.Helsinki.FI),
- *		Auvo Häkkinen (Auvo.Hakkinen@cs.Helsinki.FI)
- *		University of Helsinki, Department of Computer Science
- *			LAN entries
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>
- *			Changes for new I2O API
- */
-
-#define OSM_NAME	"proc-osm"
-#define OSM_VERSION	"1.316"
-#define OSM_DESCRIPTION	"I2O ProcFS OSM"
-
-#define I2O_MAX_MODULES 4
-// FIXME!
-#define FMT_U64_HEX "0x%08x%08x"
-#define U64_VAL(pu64) *((u32*)(pu64)+1), *((u32*)(pu64))
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/i2o.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/workqueue.h>
-
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/byteorder.h>
-
-/* Structure used to define /proc entries */
-typedef struct _i2o_proc_entry_t {
-	char *name;		/* entry name */
-	umode_t mode;		/* mode */
-	const struct file_operations *fops;	/* open function */
-} i2o_proc_entry;
-
-/* global I2O /proc/i2o entry */
-static struct proc_dir_entry *i2o_proc_dir_root;
-
-/* proc OSM driver struct */
-static struct i2o_driver i2o_proc_driver = {
-	.name = OSM_NAME,
-};
-
-static int print_serial_number(struct seq_file *seq, u8 * serialno, int max_len)
-{
-	int i;
-
-	/* 19990419 -sralston
-	 *      The I2O v1.5 (and v2.0 so far) "official specification"
-	 *      got serial numbers WRONG!
-	 *      Apparently, and despite what Section 3.4.4 says and
-	 *      Figure 3-35 shows (pg 3-39 in the pdf doc),
-	 *      the convention / consensus seems to be:
-	 *        + First byte is SNFormat
-	 *        + Second byte is SNLen (but only if SNFormat==7 (?))
-	 *        + (v2.0) SCSI+BS may use IEEE Registered (64 or 128 bit) format
-	 */
-	switch (serialno[0]) {
-	case I2O_SNFORMAT_BINARY:	/* Binary */
-		seq_printf(seq, "0x");
-		for (i = 0; i < serialno[1]; i++) {
-			seq_printf(seq, "%02X", serialno[2 + i]);
-		}
-		break;
-
-	case I2O_SNFORMAT_ASCII:	/* ASCII */
-		if (serialno[1] < ' ') {	/* printable or SNLen? */
-			/* sanity */
-			max_len =
-			    (max_len < serialno[1]) ? max_len : serialno[1];
-			serialno[1 + max_len] = '\0';
-
-			/* just print it */
-			seq_printf(seq, "%s", &serialno[2]);
-		} else {
-			/* print chars for specified length */
-			for (i = 0; i < serialno[1]; i++) {
-				seq_printf(seq, "%c", serialno[2 + i]);
-			}
-		}
-		break;
-
-	case I2O_SNFORMAT_UNICODE:	/* UNICODE */
-		seq_printf(seq, "UNICODE Format.  Can't Display\n");
-		break;
-
-	case I2O_SNFORMAT_LAN48_MAC:	/* LAN-48 MAC Address */
-		seq_printf(seq, "LAN-48 MAC address @ %pM", &serialno[2]);
-		break;
-
-	case I2O_SNFORMAT_WAN:	/* WAN MAC Address */
-		/* FIXME: Figure out what a WAN access address looks like?? */
-		seq_printf(seq, "WAN Access Address");
-		break;
-
-/* plus new in v2.0 */
-	case I2O_SNFORMAT_LAN64_MAC:	/* LAN-64 MAC Address */
-		/* FIXME: Figure out what a LAN-64 address really looks like?? */
-		seq_printf(seq,
-			   "LAN-64 MAC address @ [?:%02X:%02X:?] %pM",
-			   serialno[8], serialno[9], &serialno[2]);
-		break;
-
-	case I2O_SNFORMAT_DDM:	/* I2O DDM */
-		seq_printf(seq,
-			   "DDM: Tid=%03Xh, Rsvd=%04Xh, OrgId=%04Xh",
-			   *(u16 *) & serialno[2],
-			   *(u16 *) & serialno[4], *(u16 *) & serialno[6]);
-		break;
-
-	case I2O_SNFORMAT_IEEE_REG64:	/* IEEE Registered (64-bit) */
-	case I2O_SNFORMAT_IEEE_REG128:	/* IEEE Registered (128-bit) */
-		/* FIXME: Figure if this is even close?? */
-		seq_printf(seq,
-			   "IEEE NodeName(hi,lo)=(%08Xh:%08Xh), PortName(hi,lo)=(%08Xh:%08Xh)\n",
-			   *(u32 *) & serialno[2],
-			   *(u32 *) & serialno[6],
-			   *(u32 *) & serialno[10], *(u32 *) & serialno[14]);
-		break;
-
-	case I2O_SNFORMAT_UNKNOWN:	/* Unknown 0    */
-	case I2O_SNFORMAT_UNKNOWN2:	/* Unknown 0xff */
-	default:
-		seq_printf(seq, "Unknown data format (0x%02x)", serialno[0]);
-		break;
-	}
-
-	return 0;
-}
-
-/**
- *	i2o_get_class_name - 	do i2o class name lookup
- *	@class: class number
- *
- *	Return a descriptive string for an i2o class.
- */
-static const char *i2o_get_class_name(int class)
-{
-	int idx = 16;
-	static char *i2o_class_name[] = {
-		"Executive",
-		"Device Driver Module",
-		"Block Device",
-		"Tape Device",
-		"LAN Interface",
-		"WAN Interface",
-		"Fibre Channel Port",
-		"Fibre Channel Device",
-		"SCSI Device",
-		"ATE Port",
-		"ATE Device",
-		"Floppy Controller",
-		"Floppy Device",
-		"Secondary Bus Port",
-		"Peer Transport Agent",
-		"Peer Transport",
-		"Unknown"
-	};
-
-	switch (class & 0xfff) {
-	case I2O_CLASS_EXECUTIVE:
-		idx = 0;
-		break;
-	case I2O_CLASS_DDM:
-		idx = 1;
-		break;
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-		idx = 2;
-		break;
-	case I2O_CLASS_SEQUENTIAL_STORAGE:
-		idx = 3;
-		break;
-	case I2O_CLASS_LAN:
-		idx = 4;
-		break;
-	case I2O_CLASS_WAN:
-		idx = 5;
-		break;
-	case I2O_CLASS_FIBRE_CHANNEL_PORT:
-		idx = 6;
-		break;
-	case I2O_CLASS_FIBRE_CHANNEL_PERIPHERAL:
-		idx = 7;
-		break;
-	case I2O_CLASS_SCSI_PERIPHERAL:
-		idx = 8;
-		break;
-	case I2O_CLASS_ATE_PORT:
-		idx = 9;
-		break;
-	case I2O_CLASS_ATE_PERIPHERAL:
-		idx = 10;
-		break;
-	case I2O_CLASS_FLOPPY_CONTROLLER:
-		idx = 11;
-		break;
-	case I2O_CLASS_FLOPPY_DEVICE:
-		idx = 12;
-		break;
-	case I2O_CLASS_BUS_ADAPTER:
-		idx = 13;
-		break;
-	case I2O_CLASS_PEER_TRANSPORT_AGENT:
-		idx = 14;
-		break;
-	case I2O_CLASS_PEER_TRANSPORT:
-		idx = 15;
-		break;
-	}
-
-	return i2o_class_name[idx];
-}
-
-#define SCSI_TABLE_SIZE	13
-static char *scsi_devices[] = {
-	"Direct-Access Read/Write",
-	"Sequential-Access Storage",
-	"Printer",
-	"Processor",
-	"WORM Device",
-	"CD-ROM Device",
-	"Scanner Device",
-	"Optical Memory Device",
-	"Medium Changer Device",
-	"Communications Device",
-	"Graphics Art Pre-Press Device",
-	"Graphics Art Pre-Press Device",
-	"Array Controller Device"
-};
-
-static char *chtostr(char *tmp, u8 *chars, int n)
-{
-	tmp[0] = 0;
-	return strncat(tmp, (char *)chars, n);
-}
-
-static int i2o_report_query_status(struct seq_file *seq, int block_status,
-				   char *group)
-{
-	switch (block_status) {
-	case -ETIMEDOUT:
-		return seq_printf(seq, "Timeout reading group %s.\n", group);
-	case -ENOMEM:
-		return seq_printf(seq, "No free memory to read the table.\n");
-	case -I2O_PARAMS_STATUS_INVALID_GROUP_ID:
-		return seq_printf(seq, "Group %s not supported.\n", group);
-	default:
-		return seq_printf(seq,
-				  "Error reading group %s. BlockStatus 0x%02X\n",
-				  group, -block_status);
-	}
-}
-
-static char *bus_strings[] = {
-	"Local Bus",
-	"ISA",
-	"EISA",
-	"PCI",
-	"PCMCIA",
-	"NUBUS",
-	"CARDBUS"
-};
-
-static int i2o_seq_show_hrt(struct seq_file *seq, void *v)
-{
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	i2o_hrt *hrt = (i2o_hrt *) c->hrt.virt;
-	u32 bus;
-	int i;
-
-	if (hrt->hrt_version) {
-		seq_printf(seq,
-			   "HRT table for controller is too new a version.\n");
-		return 0;
-	}
-
-	seq_printf(seq, "HRT has %d entries of %d bytes each.\n",
-		   hrt->num_entries, hrt->entry_len << 2);
-
-	for (i = 0; i < hrt->num_entries; i++) {
-		seq_printf(seq, "Entry %d:\n", i);
-		seq_printf(seq, "   Adapter ID: %0#10x\n",
-			   hrt->hrt_entry[i].adapter_id);
-		seq_printf(seq, "   Controlling tid: %0#6x\n",
-			   hrt->hrt_entry[i].parent_tid);
-
-		if (hrt->hrt_entry[i].bus_type != 0x80) {
-			bus = hrt->hrt_entry[i].bus_type;
-			seq_printf(seq, "   %s Information\n",
-				   bus_strings[bus]);
-
-			switch (bus) {
-			case I2O_BUS_LOCAL:
-				seq_printf(seq, "     IOBase: %0#6x,",
-					   hrt->hrt_entry[i].bus.local_bus.
-					   LbBaseIOPort);
-				seq_printf(seq, " MemoryBase: %0#10x\n",
-					   hrt->hrt_entry[i].bus.local_bus.
-					   LbBaseMemoryAddress);
-				break;
-
-			case I2O_BUS_ISA:
-				seq_printf(seq, "     IOBase: %0#6x,",
-					   hrt->hrt_entry[i].bus.isa_bus.
-					   IsaBaseIOPort);
-				seq_printf(seq, " MemoryBase: %0#10x,",
-					   hrt->hrt_entry[i].bus.isa_bus.
-					   IsaBaseMemoryAddress);
-				seq_printf(seq, " CSN: %0#4x,",
-					   hrt->hrt_entry[i].bus.isa_bus.CSN);
-				break;
-
-			case I2O_BUS_EISA:
-				seq_printf(seq, "     IOBase: %0#6x,",
-					   hrt->hrt_entry[i].bus.eisa_bus.
-					   EisaBaseIOPort);
-				seq_printf(seq, " MemoryBase: %0#10x,",
-					   hrt->hrt_entry[i].bus.eisa_bus.
-					   EisaBaseMemoryAddress);
-				seq_printf(seq, " Slot: %0#4x,",
-					   hrt->hrt_entry[i].bus.eisa_bus.
-					   EisaSlotNumber);
-				break;
-
-			case I2O_BUS_PCI:
-				seq_printf(seq, "     Bus: %0#4x",
-					   hrt->hrt_entry[i].bus.pci_bus.
-					   PciBusNumber);
-				seq_printf(seq, " Dev: %0#4x",
-					   hrt->hrt_entry[i].bus.pci_bus.
-					   PciDeviceNumber);
-				seq_printf(seq, " Func: %0#4x",
-					   hrt->hrt_entry[i].bus.pci_bus.
-					   PciFunctionNumber);
-				seq_printf(seq, " Vendor: %0#6x",
-					   hrt->hrt_entry[i].bus.pci_bus.
-					   PciVendorID);
-				seq_printf(seq, " Device: %0#6x\n",
-					   hrt->hrt_entry[i].bus.pci_bus.
-					   PciDeviceID);
-				break;
-
-			default:
-				seq_printf(seq, "      Unsupported Bus Type\n");
-			}
-		} else
-			seq_printf(seq, "   Unknown Bus Type\n");
-	}
-
-	return 0;
-}
-
-static int i2o_seq_show_lct(struct seq_file *seq, void *v)
-{
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	i2o_lct *lct = (i2o_lct *) c->lct;
-	int entries;
-	int i;
-
-#define BUS_TABLE_SIZE 3
-	static char *bus_ports[] = {
-		"Generic Bus",
-		"SCSI Bus",
-		"Fibre Channel Bus"
-	};
-
-	entries = (lct->table_size - 3) / 9;
-
-	seq_printf(seq, "LCT contains %d %s\n", entries,
-		   entries == 1 ? "entry" : "entries");
-	if (lct->boot_tid)
-		seq_printf(seq, "Boot Device @ ID %d\n", lct->boot_tid);
-
-	seq_printf(seq, "Current Change Indicator: %#10x\n", lct->change_ind);
-
-	for (i = 0; i < entries; i++) {
-		seq_printf(seq, "Entry %d\n", i);
-		seq_printf(seq, "  Class, SubClass  : %s",
-			   i2o_get_class_name(lct->lct_entry[i].class_id));
-
-		/*
-		 *      Classes which we'll print subclass info for
-		 */
-		switch (lct->lct_entry[i].class_id & 0xFFF) {
-		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-			switch (lct->lct_entry[i].sub_class) {
-			case 0x00:
-				seq_printf(seq, ", Direct-Access Read/Write");
-				break;
-
-			case 0x04:
-				seq_printf(seq, ", WORM Drive");
-				break;
-
-			case 0x05:
-				seq_printf(seq, ", CD-ROM Drive");
-				break;
-
-			case 0x07:
-				seq_printf(seq, ", Optical Memory Device");
-				break;
-
-			default:
-				seq_printf(seq, ", Unknown (0x%02x)",
-					   lct->lct_entry[i].sub_class);
-				break;
-			}
-			break;
-
-		case I2O_CLASS_LAN:
-			switch (lct->lct_entry[i].sub_class & 0xFF) {
-			case 0x30:
-				seq_printf(seq, ", Ethernet");
-				break;
-
-			case 0x40:
-				seq_printf(seq, ", 100base VG");
-				break;
-
-			case 0x50:
-				seq_printf(seq, ", IEEE 802.5/Token-Ring");
-				break;
-
-			case 0x60:
-				seq_printf(seq, ", ANSI X3T9.5 FDDI");
-				break;
-
-			case 0x70:
-				seq_printf(seq, ", Fibre Channel");
-				break;
-
-			default:
-				seq_printf(seq, ", Unknown Sub-Class (0x%02x)",
-					   lct->lct_entry[i].sub_class & 0xFF);
-				break;
-			}
-			break;
-
-		case I2O_CLASS_SCSI_PERIPHERAL:
-			if (lct->lct_entry[i].sub_class < SCSI_TABLE_SIZE)
-				seq_printf(seq, ", %s",
-					   scsi_devices[lct->lct_entry[i].
-							sub_class]);
-			else
-				seq_printf(seq, ", Unknown Device Type");
-			break;
-
-		case I2O_CLASS_BUS_ADAPTER:
-			if (lct->lct_entry[i].sub_class < BUS_TABLE_SIZE)
-				seq_printf(seq, ", %s",
-					   bus_ports[lct->lct_entry[i].
-						     sub_class]);
-			else
-				seq_printf(seq, ", Unknown Bus Type");
-			break;
-		}
-		seq_printf(seq, "\n");
-
-		seq_printf(seq, "  Local TID        : 0x%03x\n",
-			   lct->lct_entry[i].tid);
-		seq_printf(seq, "  User TID         : 0x%03x\n",
-			   lct->lct_entry[i].user_tid);
-		seq_printf(seq, "  Parent TID       : 0x%03x\n",
-			   lct->lct_entry[i].parent_tid);
-		seq_printf(seq, "  Identity Tag     : 0x%x%x%x%x%x%x%x%x\n",
-			   lct->lct_entry[i].identity_tag[0],
-			   lct->lct_entry[i].identity_tag[1],
-			   lct->lct_entry[i].identity_tag[2],
-			   lct->lct_entry[i].identity_tag[3],
-			   lct->lct_entry[i].identity_tag[4],
-			   lct->lct_entry[i].identity_tag[5],
-			   lct->lct_entry[i].identity_tag[6],
-			   lct->lct_entry[i].identity_tag[7]);
-		seq_printf(seq, "  Change Indicator : %0#10x\n",
-			   lct->lct_entry[i].change_ind);
-		seq_printf(seq, "  Event Capab Mask : %0#10x\n",
-			   lct->lct_entry[i].device_flags);
-	}
-
-	return 0;
-}
-
-static int i2o_seq_show_status(struct seq_file *seq, void *v)
-{
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	char prodstr[25];
-	int version;
-	i2o_status_block *sb = c->status_block.virt;
-
-	i2o_status_get(c);	// reread the status block
-
-	seq_printf(seq, "Organization ID        : %0#6x\n", sb->org_id);
-
-	version = sb->i2o_version;
-
-/* FIXME for Spec 2.0
-	if (version == 0x02) {
-		seq_printf(seq, "Lowest I2O version supported: ");
-		switch(workspace[2]) {
-			case 0x00:
-				seq_printf(seq, "1.0\n");
-				break;
-			case 0x01:
-				seq_printf(seq, "1.5\n");
-				break;
-			case 0x02:
-				seq_printf(seq, "2.0\n");
-				break;
-		}
-
-		seq_printf(seq, "Highest I2O version supported: ");
-		switch(workspace[3]) {
-			case 0x00:
-				seq_printf(seq, "1.0\n");
-				break;
-			case 0x01:
-				seq_printf(seq, "1.5\n");
-				break;
-			case 0x02:
-				seq_printf(seq, "2.0\n");
-				break;
-		}
-	}
-*/
-	seq_printf(seq, "IOP ID                 : %0#5x\n", sb->iop_id);
-	seq_printf(seq, "Host Unit ID           : %0#6x\n", sb->host_unit_id);
-	seq_printf(seq, "Segment Number         : %0#5x\n", sb->segment_number);
-
-	seq_printf(seq, "I2O version            : ");
-	switch (version) {
-	case 0x00:
-		seq_printf(seq, "1.0\n");
-		break;
-	case 0x01:
-		seq_printf(seq, "1.5\n");
-		break;
-	case 0x02:
-		seq_printf(seq, "2.0\n");
-		break;
-	default:
-		seq_printf(seq, "Unknown version\n");
-	}
-
-	seq_printf(seq, "IOP State              : ");
-	switch (sb->iop_state) {
-	case 0x01:
-		seq_printf(seq, "INIT\n");
-		break;
-
-	case 0x02:
-		seq_printf(seq, "RESET\n");
-		break;
-
-	case 0x04:
-		seq_printf(seq, "HOLD\n");
-		break;
-
-	case 0x05:
-		seq_printf(seq, "READY\n");
-		break;
-
-	case 0x08:
-		seq_printf(seq, "OPERATIONAL\n");
-		break;
-
-	case 0x10:
-		seq_printf(seq, "FAILED\n");
-		break;
-
-	case 0x11:
-		seq_printf(seq, "FAULTED\n");
-		break;
-
-	default:
-		seq_printf(seq, "Unknown\n");
-		break;
-	}
-
-	seq_printf(seq, "Messenger Type         : ");
-	switch (sb->msg_type) {
-	case 0x00:
-		seq_printf(seq, "Memory mapped\n");
-		break;
-	case 0x01:
-		seq_printf(seq, "Memory mapped only\n");
-		break;
-	case 0x02:
-		seq_printf(seq, "Remote only\n");
-		break;
-	case 0x03:
-		seq_printf(seq, "Memory mapped and remote\n");
-		break;
-	default:
-		seq_printf(seq, "Unknown\n");
-	}
-
-	seq_printf(seq, "Inbound Frame Size     : %d bytes\n",
-		   sb->inbound_frame_size << 2);
-	seq_printf(seq, "Max Inbound Frames     : %d\n",
-		   sb->max_inbound_frames);
-	seq_printf(seq, "Current Inbound Frames : %d\n",
-		   sb->cur_inbound_frames);
-	seq_printf(seq, "Max Outbound Frames    : %d\n",
-		   sb->max_outbound_frames);
-
-	/* Spec doesn't say if NULL terminated or not... */
-	memcpy(prodstr, sb->product_id, 24);
-	prodstr[24] = '\0';
-	seq_printf(seq, "Product ID             : %s\n", prodstr);
-	seq_printf(seq, "Expected LCT Size      : %d bytes\n",
-		   sb->expected_lct_size);
-
-	seq_printf(seq, "IOP Capabilities\n");
-	seq_printf(seq, "    Context Field Size Support : ");
-	switch (sb->iop_capabilities & 0x0000003) {
-	case 0:
-		seq_printf(seq, "Supports only 32-bit context fields\n");
-		break;
-	case 1:
-		seq_printf(seq, "Supports only 64-bit context fields\n");
-		break;
-	case 2:
-		seq_printf(seq, "Supports 32-bit and 64-bit context fields, "
-			   "but not concurrently\n");
-		break;
-	case 3:
-		seq_printf(seq, "Supports 32-bit and 64-bit context fields "
-			   "concurrently\n");
-		break;
-	default:
-		seq_printf(seq, "0x%08x\n", sb->iop_capabilities);
-	}
-	seq_printf(seq, "    Current Context Field Size : ");
-	switch (sb->iop_capabilities & 0x0000000C) {
-	case 0:
-		seq_printf(seq, "not configured\n");
-		break;
-	case 4:
-		seq_printf(seq, "Supports only 32-bit context fields\n");
-		break;
-	case 8:
-		seq_printf(seq, "Supports only 64-bit context fields\n");
-		break;
-	case 12:
-		seq_printf(seq, "Supports both 32-bit or 64-bit context fields "
-			   "concurrently\n");
-		break;
-	default:
-		seq_printf(seq, "\n");
-	}
-	seq_printf(seq, "    Inbound Peer Support       : %s\n",
-		   (sb->
-		    iop_capabilities & 0x00000010) ? "Supported" :
-		   "Not supported");
-	seq_printf(seq, "    Outbound Peer Support      : %s\n",
-		   (sb->
-		    iop_capabilities & 0x00000020) ? "Supported" :
-		   "Not supported");
-	seq_printf(seq, "    Peer to Peer Support       : %s\n",
-		   (sb->
-		    iop_capabilities & 0x00000040) ? "Supported" :
-		   "Not supported");
-
-	seq_printf(seq, "Desired private memory size   : %d kB\n",
-		   sb->desired_mem_size >> 10);
-	seq_printf(seq, "Allocated private memory size : %d kB\n",
-		   sb->current_mem_size >> 10);
-	seq_printf(seq, "Private memory base address   : %0#10x\n",
-		   sb->current_mem_base);
-	seq_printf(seq, "Desired private I/O size      : %d kB\n",
-		   sb->desired_io_size >> 10);
-	seq_printf(seq, "Allocated private I/O size    : %d kB\n",
-		   sb->current_io_size >> 10);
-	seq_printf(seq, "Private I/O base address      : %0#10x\n",
-		   sb->current_io_base);
-
-	return 0;
-}
-
-static int i2o_seq_show_hw(struct seq_file *seq, void *v)
-{
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	static u32 work32[5];
-	static u8 *work8 = (u8 *) work32;
-	static u16 *work16 = (u16 *) work32;
-	int token;
-	u32 hwcap;
-
-	static char *cpu_table[] = {
-		"Intel 80960 series",
-		"AMD2900 series",
-		"Motorola 68000 series",
-		"ARM series",
-		"MIPS series",
-		"Sparc series",
-		"PowerPC series",
-		"Intel x86 series"
-	};
-
-	token =
-	    i2o_parm_field_get(c->exec, 0x0000, -1, &work32, sizeof(work32));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0x0000 IOP Hardware");
-		return 0;
-	}
-
-	seq_printf(seq, "I2O Vendor ID    : %0#6x\n", work16[0]);
-	seq_printf(seq, "Product ID       : %0#6x\n", work16[1]);
-	seq_printf(seq, "CPU              : ");
-	if (work8[16] > 8)
-		seq_printf(seq, "Unknown\n");
-	else
-		seq_printf(seq, "%s\n", cpu_table[work8[16]]);
-	/* Anyone using ProcessorVersion? */
-
-	seq_printf(seq, "RAM              : %dkB\n", work32[1] >> 10);
-	seq_printf(seq, "Non-Volatile Mem : %dkB\n", work32[2] >> 10);
-
-	hwcap = work32[3];
-	seq_printf(seq, "Capabilities : 0x%08x\n", hwcap);
-	seq_printf(seq, "   [%s] Self booting\n",
-		   (hwcap & 0x00000001) ? "+" : "-");
-	seq_printf(seq, "   [%s] Upgradable IRTOS\n",
-		   (hwcap & 0x00000002) ? "+" : "-");
-	seq_printf(seq, "   [%s] Supports downloading DDMs\n",
-		   (hwcap & 0x00000004) ? "+" : "-");
-	seq_printf(seq, "   [%s] Supports installing DDMs\n",
-		   (hwcap & 0x00000008) ? "+" : "-");
-	seq_printf(seq, "   [%s] Battery-backed RAM\n",
-		   (hwcap & 0x00000010) ? "+" : "-");
-
-	return 0;
-}
-
-/* Executive group 0003h - Executing DDM List (table) */
-static int i2o_seq_show_ddm_table(struct seq_file *seq, void *v)
-{
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	int token;
-	int i;
-
-	typedef struct _i2o_exec_execute_ddm_table {
-		u16 ddm_tid;
-		u8 module_type;
-		u8 reserved;
-		u16 i2o_vendor_id;
-		u16 module_id;
-		u8 module_name_version[28];
-		u32 data_size;
-		u32 code_size;
-	} i2o_exec_execute_ddm_table;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		i2o_exec_execute_ddm_table ddm_table[I2O_MAX_MODULES];
-	} *result;
-
-	i2o_exec_execute_ddm_table ddm_table;
-	char tmp[28 + 1];
-
-	result = kmalloc(sizeof(*result), GFP_KERNEL);
-	if (!result)
-		return -ENOMEM;
-
-	token = i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0003, -1,
-				   NULL, 0, result, sizeof(*result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0x0003 Executing DDM List");
-		goto out;
-	}
-
-	seq_printf(seq,
-		   "Tid   Module_type     Vendor Mod_id  Module_name             Vrs  Data_size Code_size\n");
-	ddm_table = result->ddm_table[0];
-
-	for (i = 0; i < result->row_count; ddm_table = result->ddm_table[++i]) {
-		seq_printf(seq, "0x%03x ", ddm_table.ddm_tid & 0xFFF);
-
-		switch (ddm_table.module_type) {
-		case 0x01:
-			seq_printf(seq, "Downloaded DDM  ");
-			break;
-		case 0x22:
-			seq_printf(seq, "Embedded DDM    ");
-			break;
-		default:
-			seq_printf(seq, "                ");
-		}
-
-		seq_printf(seq, "%-#7x", ddm_table.i2o_vendor_id);
-		seq_printf(seq, "%-#8x", ddm_table.module_id);
-		seq_printf(seq, "%-29s",
-			   chtostr(tmp, ddm_table.module_name_version, 28));
-		seq_printf(seq, "%9d  ", ddm_table.data_size);
-		seq_printf(seq, "%8d", ddm_table.code_size);
-
-		seq_printf(seq, "\n");
-	}
-      out:
-	kfree(result);
-	return 0;
-}
-
-/* Executive group 0004h - Driver Store (scalar) */
-static int i2o_seq_show_driver_store(struct seq_file *seq, void *v)
-{
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	u32 work32[8];
-	int token;
-
-	token =
-	    i2o_parm_field_get(c->exec, 0x0004, -1, &work32, sizeof(work32));
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0x0004 Driver Store");
-		return 0;
-	}
-
-	seq_printf(seq, "Module limit  : %d\n"
-		   "Module count  : %d\n"
-		   "Current space : %d kB\n"
-		   "Free space    : %d kB\n",
-		   work32[0], work32[1], work32[2] >> 10, work32[3] >> 10);
-
-	return 0;
-}
-
-/* Executive group 0005h - Driver Store Table (table) */
-static int i2o_seq_show_drivers_stored(struct seq_file *seq, void *v)
-{
-	typedef struct _i2o_driver_store {
-		u16 stored_ddm_index;
-		u8 module_type;
-		u8 reserved;
-		u16 i2o_vendor_id;
-		u16 module_id;
-		u8 module_name_version[28];
-		u8 date[8];
-		u32 module_size;
-		u32 mpb_size;
-		u32 module_flags;
-	} i2o_driver_store_table;
-
-	struct i2o_controller *c = (struct i2o_controller *)seq->private;
-	int token;
-	int i;
-
-	typedef struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		i2o_driver_store_table dst[I2O_MAX_MODULES];
-	} i2o_driver_result_table;
-
-	i2o_driver_result_table *result;
-	i2o_driver_store_table *dst;
-	char tmp[28 + 1];
-
-	result = kmalloc(sizeof(i2o_driver_result_table), GFP_KERNEL);
-	if (result == NULL)
-		return -ENOMEM;
-
-	token = i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0005, -1,
-				   NULL, 0, result, sizeof(*result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0x0005 DRIVER STORE TABLE");
-		kfree(result);
-		return 0;
-	}
-
-	seq_printf(seq,
-		   "#  Module_type     Vendor Mod_id  Module_name             Vrs"
-		   "Date     Mod_size Par_size Flags\n");
-	for (i = 0, dst = &result->dst[0]; i < result->row_count;
-	     dst = &result->dst[++i]) {
-		seq_printf(seq, "%-3d", dst->stored_ddm_index);
-		switch (dst->module_type) {
-		case 0x01:
-			seq_printf(seq, "Downloaded DDM  ");
-			break;
-		case 0x22:
-			seq_printf(seq, "Embedded DDM    ");
-			break;
-		default:
-			seq_printf(seq, "                ");
-		}
-
-		seq_printf(seq, "%-#7x", dst->i2o_vendor_id);
-		seq_printf(seq, "%-#8x", dst->module_id);
-		seq_printf(seq, "%-29s",
-			   chtostr(tmp, dst->module_name_version, 28));
-		seq_printf(seq, "%-9s", chtostr(tmp, dst->date, 8));
-		seq_printf(seq, "%8d ", dst->module_size);
-		seq_printf(seq, "%8d ", dst->mpb_size);
-		seq_printf(seq, "0x%04x", dst->module_flags);
-		seq_printf(seq, "\n");
-	}
-
-	kfree(result);
-	return 0;
-}
-
-/* Generic group F000h - Params Descriptor (table) */
-static int i2o_seq_show_groups(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-	int i;
-	u8 properties;
-
-	typedef struct _i2o_group_info {
-		u16 group_number;
-		u16 field_count;
-		u16 row_count;
-		u8 properties;
-		u8 reserved;
-	} i2o_group_info;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		i2o_group_info group[256];
-	} *result;
-
-	result = kmalloc(sizeof(*result), GFP_KERNEL);
-	if (!result)
-		return -ENOMEM;
-
-	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0,
-				   result, sizeof(*result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0xF000 Params Descriptor");
-		goto out;
-	}
-
-	seq_printf(seq,
-		   "#  Group   FieldCount RowCount Type   Add Del Clear\n");
-
-	for (i = 0; i < result->row_count; i++) {
-		seq_printf(seq, "%-3d", i);
-		seq_printf(seq, "0x%04X ", result->group[i].group_number);
-		seq_printf(seq, "%10d ", result->group[i].field_count);
-		seq_printf(seq, "%8d ", result->group[i].row_count);
-
-		properties = result->group[i].properties;
-		if (properties & 0x1)
-			seq_printf(seq, "Table  ");
-		else
-			seq_printf(seq, "Scalar ");
-		if (properties & 0x2)
-			seq_printf(seq, " + ");
-		else
-			seq_printf(seq, " - ");
-		if (properties & 0x4)
-			seq_printf(seq, "  + ");
-		else
-			seq_printf(seq, "  - ");
-		if (properties & 0x8)
-			seq_printf(seq, "  + ");
-		else
-			seq_printf(seq, "  - ");
-
-		seq_printf(seq, "\n");
-	}
-
-	if (result->more_flag)
-		seq_printf(seq, "There is more...\n");
-      out:
-	kfree(result);
-	return 0;
-}
-
-/* Generic group F001h - Physical Device Table (table) */
-static int i2o_seq_show_phys_device(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-	int i;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		u32 adapter_id[64];
-	} result;
-
-	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF001, -1, NULL, 0,
-				   &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0xF001 Physical Device Table");
-		return 0;
-	}
-
-	if (result.row_count)
-		seq_printf(seq, "#  AdapterId\n");
-
-	for (i = 0; i < result.row_count; i++) {
-		seq_printf(seq, "%-2d", i);
-		seq_printf(seq, "%#7x\n", result.adapter_id[i]);
-	}
-
-	if (result.more_flag)
-		seq_printf(seq, "There is more...\n");
-
-	return 0;
-}
-
-/* Generic group F002h - Claimed Table (table) */
-static int i2o_seq_show_claimed(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-	int i;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		u16 claimed_tid[64];
-	} result;
-
-	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF002, -1, NULL, 0,
-				   &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0xF002 Claimed Table");
-		return 0;
-	}
-
-	if (result.row_count)
-		seq_printf(seq, "#  ClaimedTid\n");
-
-	for (i = 0; i < result.row_count; i++) {
-		seq_printf(seq, "%-2d", i);
-		seq_printf(seq, "%#7x\n", result.claimed_tid[i]);
-	}
-
-	if (result.more_flag)
-		seq_printf(seq, "There is more...\n");
-
-	return 0;
-}
-
-/* Generic group F003h - User Table (table) */
-static int i2o_seq_show_users(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-	int i;
-
-	typedef struct _i2o_user_table {
-		u16 instance;
-		u16 user_tid;
-		u8 claim_type;
-		u8 reserved1;
-		u16 reserved2;
-	} i2o_user_table;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		i2o_user_table user[64];
-	} *result;
-
-	result = kmalloc(sizeof(*result), GFP_KERNEL);
-	if (!result)
-		return -ENOMEM;
-
-	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF003, -1, NULL, 0,
-				   result, sizeof(*result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0xF003 User Table");
-		goto out;
-	}
-
-	seq_printf(seq, "#  Instance UserTid ClaimType\n");
-
-	for (i = 0; i < result->row_count; i++) {
-		seq_printf(seq, "%-3d", i);
-		seq_printf(seq, "%#8x ", result->user[i].instance);
-		seq_printf(seq, "%#7x ", result->user[i].user_tid);
-		seq_printf(seq, "%#9x\n", result->user[i].claim_type);
-	}
-
-	if (result->more_flag)
-		seq_printf(seq, "There is more...\n");
-      out:
-	kfree(result);
-	return 0;
-}
-
-/* Generic group F005h - Private message extensions (table) (optional) */
-static int i2o_seq_show_priv_msgs(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-	int i;
-
-	typedef struct _i2o_private {
-		u16 ext_instance;
-		u16 organization_id;
-		u16 x_function_code;
-	} i2o_private;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		i2o_private extension[64];
-	} result;
-
-	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0,
-				   &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0xF005 Private Message Extensions (optional)");
-		return 0;
-	}
-
-	seq_printf(seq, "Instance#  OrgId  FunctionCode\n");
-
-	for (i = 0; i < result.row_count; i++) {
-		seq_printf(seq, "%0#9x ", result.extension[i].ext_instance);
-		seq_printf(seq, "%0#6x ", result.extension[i].organization_id);
-		seq_printf(seq, "%0#6x", result.extension[i].x_function_code);
-
-		seq_printf(seq, "\n");
-	}
-
-	if (result.more_flag)
-		seq_printf(seq, "There is more...\n");
-
-	return 0;
-}
-
-/* Generic group F006h - Authorized User Table (table) */
-static int i2o_seq_show_authorized_users(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-	int i;
-
-	struct {
-		u16 result_count;
-		u16 pad;
-		u16 block_size;
-		u8 block_status;
-		u8 error_info_size;
-		u16 row_count;
-		u16 more_flag;
-		u32 alternate_tid[64];
-	} result;
-
-	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF006, -1, NULL, 0,
-				   &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0xF006 Autohorized User Table");
-		return 0;
-	}
-
-	if (result.row_count)
-		seq_printf(seq, "#  AlternateTid\n");
-
-	for (i = 0; i < result.row_count; i++) {
-		seq_printf(seq, "%-2d", i);
-		seq_printf(seq, "%#7x ", result.alternate_tid[i]);
-	}
-
-	if (result.more_flag)
-		seq_printf(seq, "There is more...\n");
-
-	return 0;
-}
-
-/* Generic group F100h - Device Identity (scalar) */
-static int i2o_seq_show_dev_identity(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	static u32 work32[128];	// allow for "stuff" + up to 256 byte (max) serial number
-	// == (allow) 512d bytes (max)
-	static u16 *work16 = (u16 *) work32;
-	int token;
-	char tmp[16 + 1];
-
-	token = i2o_parm_field_get(d, 0xF100, -1, &work32, sizeof(work32));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0xF100 Device Identity");
-		return 0;
-	}
-
-	seq_printf(seq, "Device Class  : %s\n", i2o_get_class_name(work16[0]));
-	seq_printf(seq, "Owner TID     : %0#5x\n", work16[2]);
-	seq_printf(seq, "Parent TID    : %0#5x\n", work16[3]);
-	seq_printf(seq, "Vendor info   : %s\n",
-		   chtostr(tmp, (u8 *) (work32 + 2), 16));
-	seq_printf(seq, "Product info  : %s\n",
-		   chtostr(tmp, (u8 *) (work32 + 6), 16));
-	seq_printf(seq, "Description   : %s\n",
-		   chtostr(tmp, (u8 *) (work32 + 10), 16));
-	seq_printf(seq, "Product rev.  : %s\n",
-		   chtostr(tmp, (u8 *) (work32 + 14), 8));
-
-	seq_printf(seq, "Serial number : ");
-	print_serial_number(seq, (u8 *) (work32 + 16),
-			    /* allow for SNLen plus
-			     * possible trailing '\0'
-			     */
-			    sizeof(work32) - (16 * sizeof(u32)) - 2);
-	seq_printf(seq, "\n");
-
-	return 0;
-}
-
-static int i2o_seq_show_dev_name(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-
-	seq_printf(seq, "%s\n", dev_name(&d->device));
-
-	return 0;
-}
-
-/* Generic group F101h - DDM Identity (scalar) */
-static int i2o_seq_show_ddm_identity(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-
-	struct {
-		u16 ddm_tid;
-		u8 module_name[24];
-		u8 module_rev[8];
-		u8 sn_format;
-		u8 serial_number[12];
-		u8 pad[256];	// allow up to 256 byte (max) serial number
-	} result;
-
-	char tmp[24 + 1];
-
-	token = i2o_parm_field_get(d, 0xF101, -1, &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0xF101 DDM Identity");
-		return 0;
-	}
-
-	seq_printf(seq, "Registering DDM TID : 0x%03x\n", result.ddm_tid);
-	seq_printf(seq, "Module name         : %s\n",
-		   chtostr(tmp, result.module_name, 24));
-	seq_printf(seq, "Module revision     : %s\n",
-		   chtostr(tmp, result.module_rev, 8));
-
-	seq_printf(seq, "Serial number       : ");
-	print_serial_number(seq, result.serial_number, sizeof(result) - 36);
-	/* allow for SNLen plus possible trailing '\0' */
-
-	seq_printf(seq, "\n");
-
-	return 0;
-}
-
-/* Generic group F102h - User Information (scalar) */
-static int i2o_seq_show_uinfo(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-
-	struct {
-		u8 device_name[64];
-		u8 service_name[64];
-		u8 physical_location[64];
-		u8 instance_number[4];
-	} result;
-
-	char tmp[64 + 1];
-
-	token = i2o_parm_field_get(d, 0xF102, -1, &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token, "0xF102 User Information");
-		return 0;
-	}
-
-	seq_printf(seq, "Device name     : %s\n",
-		   chtostr(tmp, result.device_name, 64));
-	seq_printf(seq, "Service name    : %s\n",
-		   chtostr(tmp, result.service_name, 64));
-	seq_printf(seq, "Physical name   : %s\n",
-		   chtostr(tmp, result.physical_location, 64));
-	seq_printf(seq, "Instance number : %s\n",
-		   chtostr(tmp, result.instance_number, 4));
-
-	return 0;
-}
-
-/* Generic group F103h - SGL Operating Limits (scalar) */
-static int i2o_seq_show_sgl_limits(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	static u32 work32[12];
-	static u16 *work16 = (u16 *) work32;
-	static u8 *work8 = (u8 *) work32;
-	int token;
-
-	token = i2o_parm_field_get(d, 0xF103, -1, &work32, sizeof(work32));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0xF103 SGL Operating Limits");
-		return 0;
-	}
-
-	seq_printf(seq, "SGL chain size        : %d\n", work32[0]);
-	seq_printf(seq, "Max SGL chain size    : %d\n", work32[1]);
-	seq_printf(seq, "SGL chain size target : %d\n", work32[2]);
-	seq_printf(seq, "SGL frag count        : %d\n", work16[6]);
-	seq_printf(seq, "Max SGL frag count    : %d\n", work16[7]);
-	seq_printf(seq, "SGL frag count target : %d\n", work16[8]);
-
-/* FIXME
-	if (d->i2oversion == 0x02)
-	{
-*/
-	seq_printf(seq, "SGL data alignment    : %d\n", work16[8]);
-	seq_printf(seq, "SGL addr limit        : %d\n", work8[20]);
-	seq_printf(seq, "SGL addr sizes supported : ");
-	if (work8[21] & 0x01)
-		seq_printf(seq, "32 bit ");
-	if (work8[21] & 0x02)
-		seq_printf(seq, "64 bit ");
-	if (work8[21] & 0x04)
-		seq_printf(seq, "96 bit ");
-	if (work8[21] & 0x08)
-		seq_printf(seq, "128 bit ");
-	seq_printf(seq, "\n");
-/*
-	}
-*/
-
-	return 0;
-}
-
-/* Generic group F200h - Sensors (scalar) */
-static int i2o_seq_show_sensors(struct seq_file *seq, void *v)
-{
-	struct i2o_device *d = (struct i2o_device *)seq->private;
-	int token;
-
-	struct {
-		u16 sensor_instance;
-		u8 component;
-		u16 component_instance;
-		u8 sensor_class;
-		u8 sensor_type;
-		u8 scaling_exponent;
-		u32 actual_reading;
-		u32 minimum_reading;
-		u32 low2lowcat_treshold;
-		u32 lowcat2low_treshold;
-		u32 lowwarn2low_treshold;
-		u32 low2lowwarn_treshold;
-		u32 norm2lowwarn_treshold;
-		u32 lowwarn2norm_treshold;
-		u32 nominal_reading;
-		u32 hiwarn2norm_treshold;
-		u32 norm2hiwarn_treshold;
-		u32 high2hiwarn_treshold;
-		u32 hiwarn2high_treshold;
-		u32 hicat2high_treshold;
-		u32 hi2hicat_treshold;
-		u32 maximum_reading;
-		u8 sensor_state;
-		u16 event_enable;
-	} result;
-
-	token = i2o_parm_field_get(d, 0xF200, -1, &result, sizeof(result));
-
-	if (token < 0) {
-		i2o_report_query_status(seq, token,
-					"0xF200 Sensors (optional)");
-		return 0;
-	}
-
-	seq_printf(seq, "Sensor instance       : %d\n", result.sensor_instance);
-
-	seq_printf(seq, "Component             : %d = ", result.component);
-	switch (result.component) {
-	case 0:
-		seq_printf(seq, "Other");
-		break;
-	case 1:
-		seq_printf(seq, "Planar logic Board");
-		break;
-	case 2:
-		seq_printf(seq, "CPU");
-		break;
-	case 3:
-		seq_printf(seq, "Chassis");
-		break;
-	case 4:
-		seq_printf(seq, "Power Supply");
-		break;
-	case 5:
-		seq_printf(seq, "Storage");
-		break;
-	case 6:
-		seq_printf(seq, "External");
-		break;
-	}
-	seq_printf(seq, "\n");
-
-	seq_printf(seq, "Component instance    : %d\n",
-		   result.component_instance);
-	seq_printf(seq, "Sensor class          : %s\n",
-		   result.sensor_class ? "Analog" : "Digital");
-
-	seq_printf(seq, "Sensor type           : %d = ", result.sensor_type);
-	switch (result.sensor_type) {
-	case 0:
-		seq_printf(seq, "Other\n");
-		break;
-	case 1:
-		seq_printf(seq, "Thermal\n");
-		break;
-	case 2:
-		seq_printf(seq, "DC voltage (DC volts)\n");
-		break;
-	case 3:
-		seq_printf(seq, "AC voltage (AC volts)\n");
-		break;
-	case 4:
-		seq_printf(seq, "DC current (DC amps)\n");
-		break;
-	case 5:
-		seq_printf(seq, "AC current (AC volts)\n");
-		break;
-	case 6:
-		seq_printf(seq, "Door open\n");
-		break;
-	case 7:
-		seq_printf(seq, "Fan operational\n");
-		break;
-	}
-
-	seq_printf(seq, "Scaling exponent      : %d\n",
-		   result.scaling_exponent);
-	seq_printf(seq, "Actual reading        : %d\n", result.actual_reading);
-	seq_printf(seq, "Minimum reading       : %d\n", result.minimum_reading);
-	seq_printf(seq, "Low2LowCat treshold   : %d\n",
-		   result.low2lowcat_treshold);
-	seq_printf(seq, "LowCat2Low treshold   : %d\n",
-		   result.lowcat2low_treshold);
-	seq_printf(seq, "LowWarn2Low treshold  : %d\n",
-		   result.lowwarn2low_treshold);
-	seq_printf(seq, "Low2LowWarn treshold  : %d\n",
-		   result.low2lowwarn_treshold);
-	seq_printf(seq, "Norm2LowWarn treshold : %d\n",
-		   result.norm2lowwarn_treshold);
-	seq_printf(seq, "LowWarn2Norm treshold : %d\n",
-		   result.lowwarn2norm_treshold);
-	seq_printf(seq, "Nominal reading       : %d\n", result.nominal_reading);
-	seq_printf(seq, "HiWarn2Norm treshold  : %d\n",
-		   result.hiwarn2norm_treshold);
-	seq_printf(seq, "Norm2HiWarn treshold  : %d\n",
-		   result.norm2hiwarn_treshold);
-	seq_printf(seq, "High2HiWarn treshold  : %d\n",
-		   result.high2hiwarn_treshold);
-	seq_printf(seq, "HiWarn2High treshold  : %d\n",
-		   result.hiwarn2high_treshold);
-	seq_printf(seq, "HiCat2High treshold   : %d\n",
-		   result.hicat2high_treshold);
-	seq_printf(seq, "High2HiCat treshold   : %d\n",
-		   result.hi2hicat_treshold);
-	seq_printf(seq, "Maximum reading       : %d\n", result.maximum_reading);
-
-	seq_printf(seq, "Sensor state          : %d = ", result.sensor_state);
-	switch (result.sensor_state) {
-	case 0:
-		seq_printf(seq, "Normal\n");
-		break;
-	case 1:
-		seq_printf(seq, "Abnormal\n");
-		break;
-	case 2:
-		seq_printf(seq, "Unknown\n");
-		break;
-	case 3:
-		seq_printf(seq, "Low Catastrophic (LoCat)\n");
-		break;
-	case 4:
-		seq_printf(seq, "Low (Low)\n");
-		break;
-	case 5:
-		seq_printf(seq, "Low Warning (LoWarn)\n");
-		break;
-	case 6:
-		seq_printf(seq, "High Warning (HiWarn)\n");
-		break;
-	case 7:
-		seq_printf(seq, "High (High)\n");
-		break;
-	case 8:
-		seq_printf(seq, "High Catastrophic (HiCat)\n");
-		break;
-	}
-
-	seq_printf(seq, "Event_enable : 0x%02X\n", result.event_enable);
-	seq_printf(seq, "    [%s] Operational state change. \n",
-		   (result.event_enable & 0x01) ? "+" : "-");
-	seq_printf(seq, "    [%s] Low catastrophic. \n",
-		   (result.event_enable & 0x02) ? "+" : "-");
-	seq_printf(seq, "    [%s] Low reading. \n",
-		   (result.event_enable & 0x04) ? "+" : "-");
-	seq_printf(seq, "    [%s] Low warning. \n",
-		   (result.event_enable & 0x08) ? "+" : "-");
-	seq_printf(seq,
-		   "    [%s] Change back to normal from out of range state. \n",
-		   (result.event_enable & 0x10) ? "+" : "-");
-	seq_printf(seq, "    [%s] High warning. \n",
-		   (result.event_enable & 0x20) ? "+" : "-");
-	seq_printf(seq, "    [%s] High reading. \n",
-		   (result.event_enable & 0x40) ? "+" : "-");
-	seq_printf(seq, "    [%s] High catastrophic. \n",
-		   (result.event_enable & 0x80) ? "+" : "-");
-
-	return 0;
-}
-
-static int i2o_seq_open_hrt(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_hrt, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_lct(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_lct, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_status(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_status, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_hw(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_hw, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_ddm_table(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_ddm_table, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_driver_store(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_driver_store, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_drivers_stored(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_drivers_stored, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_groups(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_groups, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_phys_device(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_phys_device, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_claimed(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_claimed, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_users(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_users, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_priv_msgs(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_priv_msgs, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_authorized_users(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_authorized_users,
-			   PDE_DATA(inode));
-};
-
-static int i2o_seq_open_dev_identity(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_dev_identity, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_ddm_identity(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_ddm_identity, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_uinfo(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_uinfo, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_sgl_limits(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_sgl_limits, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_sensors(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_sensors, PDE_DATA(inode));
-};
-
-static int i2o_seq_open_dev_name(struct inode *inode, struct file *file)
-{
-	return single_open(file, i2o_seq_show_dev_name, PDE_DATA(inode));
-};
-
-static const struct file_operations i2o_seq_fops_lct = {
-	.open = i2o_seq_open_lct,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_hrt = {
-	.open = i2o_seq_open_hrt,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_status = {
-	.open = i2o_seq_open_status,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_hw = {
-	.open = i2o_seq_open_hw,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_ddm_table = {
-	.open = i2o_seq_open_ddm_table,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_driver_store = {
-	.open = i2o_seq_open_driver_store,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_drivers_stored = {
-	.open = i2o_seq_open_drivers_stored,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_groups = {
-	.open = i2o_seq_open_groups,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_phys_device = {
-	.open = i2o_seq_open_phys_device,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_claimed = {
-	.open = i2o_seq_open_claimed,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_users = {
-	.open = i2o_seq_open_users,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_priv_msgs = {
-	.open = i2o_seq_open_priv_msgs,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_authorized_users = {
-	.open = i2o_seq_open_authorized_users,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_dev_name = {
-	.open = i2o_seq_open_dev_name,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_dev_identity = {
-	.open = i2o_seq_open_dev_identity,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_ddm_identity = {
-	.open = i2o_seq_open_ddm_identity,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_uinfo = {
-	.open = i2o_seq_open_uinfo,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_sgl_limits = {
-	.open = i2o_seq_open_sgl_limits,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static const struct file_operations i2o_seq_fops_sensors = {
-	.open = i2o_seq_open_sensors,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-/*
- * IOP specific entries...write field just in case someone
- * ever wants one.
- */
-static i2o_proc_entry i2o_proc_generic_iop_entries[] = {
-	{"hrt", S_IFREG | S_IRUGO, &i2o_seq_fops_hrt},
-	{"lct", S_IFREG | S_IRUGO, &i2o_seq_fops_lct},
-	{"status", S_IFREG | S_IRUGO, &i2o_seq_fops_status},
-	{"hw", S_IFREG | S_IRUGO, &i2o_seq_fops_hw},
-	{"ddm_table", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_table},
-	{"driver_store", S_IFREG | S_IRUGO, &i2o_seq_fops_driver_store},
-	{"drivers_stored", S_IFREG | S_IRUGO, &i2o_seq_fops_drivers_stored},
-	{NULL, 0, NULL}
-};
-
-/*
- * Device specific entries
- */
-static i2o_proc_entry generic_dev_entries[] = {
-	{"groups", S_IFREG | S_IRUGO, &i2o_seq_fops_groups},
-	{"phys_dev", S_IFREG | S_IRUGO, &i2o_seq_fops_phys_device},
-	{"claimed", S_IFREG | S_IRUGO, &i2o_seq_fops_claimed},
-	{"users", S_IFREG | S_IRUGO, &i2o_seq_fops_users},
-	{"priv_msgs", S_IFREG | S_IRUGO, &i2o_seq_fops_priv_msgs},
-	{"authorized_users", S_IFREG | S_IRUGO, &i2o_seq_fops_authorized_users},
-	{"dev_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_identity},
-	{"ddm_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_identity},
-	{"user_info", S_IFREG | S_IRUGO, &i2o_seq_fops_uinfo},
-	{"sgl_limits", S_IFREG | S_IRUGO, &i2o_seq_fops_sgl_limits},
-	{"sensors", S_IFREG | S_IRUGO, &i2o_seq_fops_sensors},
-	{NULL, 0, NULL}
-};
-
-/*
- *  Storage unit specific entries (SCSI Periph, BS) with device names
- */
-static i2o_proc_entry rbs_dev_entries[] = {
-	{"dev_name", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_name},
-	{NULL, 0, NULL}
-};
-
-/**
- *	i2o_proc_create_entries - Creates proc dir entries
- *	@dir: proc dir entry under which the entries should be placed
- *	@i2o_pe: pointer to the entries which should be added
- *	@data: pointer to I2O controller or device
- *
- *	Create proc dir entries for a I2O controller or I2O device.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_proc_create_entries(struct proc_dir_entry *dir,
-				   i2o_proc_entry * i2o_pe, void *data)
-{
-	struct proc_dir_entry *tmp;
-
-	while (i2o_pe->name) {
-		tmp = proc_create_data(i2o_pe->name, i2o_pe->mode, dir,
-				       i2o_pe->fops, data);
-		if (!tmp)
-			return -1;
-
-		i2o_pe++;
-	}
-
-	return 0;
-}
-
-/**
- *	i2o_proc_device_add - Add an I2O device to the proc dir
- *	@dir: proc dir entry to which the device should be added
- *	@dev: I2O device which should be added
- *
- *	Add an I2O device to the proc dir entry dir and create the entries for
- *	the device depending on the class of the I2O device.
- */
-static void i2o_proc_device_add(struct proc_dir_entry *dir,
-				struct i2o_device *dev)
-{
-	char buff[10];
-	struct proc_dir_entry *devdir;
-	i2o_proc_entry *i2o_pe = NULL;
-
-	sprintf(buff, "%03x", dev->lct_data.tid);
-
-	osm_debug("adding device /proc/i2o/%s/%s\n", dev->iop->name, buff);
-
-	devdir = proc_mkdir_data(buff, 0, dir, dev);
-	if (!devdir) {
-		osm_warn("Could not allocate procdir!\n");
-		return;
-	}
-
-	i2o_proc_create_entries(devdir, generic_dev_entries, dev);
-
-	/* Inform core that we want updates about this device's status */
-	switch (dev->lct_data.class_id) {
-	case I2O_CLASS_SCSI_PERIPHERAL:
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-		i2o_pe = rbs_dev_entries;
-		break;
-	default:
-		break;
-	}
-	if (i2o_pe)
-		i2o_proc_create_entries(devdir, i2o_pe, dev);
-}
-
-/**
- *	i2o_proc_iop_add - Add an I2O controller to the i2o proc tree
- *	@dir: parent proc dir entry
- *	@c: I2O controller which should be added
- *
- *	Add the entries to the parent proc dir entry. Also each device is added
- *	to the controllers proc dir entry.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_proc_iop_add(struct proc_dir_entry *dir,
-			    struct i2o_controller *c)
-{
-	struct proc_dir_entry *iopdir;
-	struct i2o_device *dev;
-
-	osm_debug("adding IOP /proc/i2o/%s\n", c->name);
-
-	iopdir = proc_mkdir_data(c->name, 0, dir, c);
-	if (!iopdir)
-		return -1;
-
-	i2o_proc_create_entries(iopdir, i2o_proc_generic_iop_entries, c);
-
-	list_for_each_entry(dev, &c->devices, list)
-	    i2o_proc_device_add(iopdir, dev);
-
-	return 0;
-}
-
-/**
- *	i2o_proc_fs_create - Create the i2o proc fs.
- *
- *	Iterate over each I2O controller and create the entries for it.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_proc_fs_create(void)
-{
-	struct i2o_controller *c;
-
-	i2o_proc_dir_root = proc_mkdir("i2o", NULL);
-	if (!i2o_proc_dir_root)
-		return -1;
-
-	list_for_each_entry(c, &i2o_controllers, list)
-	    i2o_proc_iop_add(i2o_proc_dir_root, c);
-
-	return 0;
-};
-
-/**
- *	i2o_proc_fs_destroy - Cleanup the all i2o proc entries
- *
- *	Iterate over each I2O controller and remove the entries for it.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __exit i2o_proc_fs_destroy(void)
-{
-	remove_proc_subtree("i2o", NULL);
-
-	return 0;
-};
-
-/**
- *	i2o_proc_init - Init function for procfs
- *
- *	Registers Proc OSM and creates procfs entries.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_proc_init(void)
-{
-	int rc;
-
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
-	rc = i2o_driver_register(&i2o_proc_driver);
-	if (rc)
-		return rc;
-
-	rc = i2o_proc_fs_create();
-	if (rc) {
-		i2o_driver_unregister(&i2o_proc_driver);
-		return rc;
-	}
-
-	return 0;
-};
-
-/**
- *	i2o_proc_exit - Exit function for procfs
- *
- *	Unregisters Proc OSM and removes procfs entries.
- */
-static void __exit i2o_proc_exit(void)
-{
-	i2o_driver_unregister(&i2o_proc_driver);
-	i2o_proc_fs_destroy();
-};
-
-MODULE_AUTHOR("Deepak Saxena");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-module_init(i2o_proc_init);
-module_exit(i2o_proc_exit);
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
deleted file mode 100644
index 8152e9fa9d95..000000000000
--- a/drivers/message/i2o/i2o_scsi.c
+++ /dev/null
@@ -1,814 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * For the avoidance of doubt the "preferred form" of this code is one which
- * is in an open non patent encumbered format. Where cryptographic key signing
- * forms part of the process of creating an executable the information
- * including keys needed to generate an equivalently functional executable
- * are deemed to be part of the source code.
- *
- *  Complications for I2O scsi
- *
- *	o	Each (bus,lun) is a logical device in I2O. We keep a map
- *		table. We spoof failed selection for unmapped units
- *	o	Request sense buffers can come back for free.
- *	o	Scatter gather is a bit dynamic. We have to investigate at
- *		setup time.
- *	o	Some of our resources are dynamically shared. The i2o core
- *		needs a message reservation protocol to avoid swap v net
- *		deadlocking. We need to back off queue requests.
- *
- *	In general the firmware wants to help. Where its help isn't performance
- *	useful we just ignore the aid. Its not worth the code in truth.
- *
- * Fixes/additions:
- *	Steve Ralston:
- *		Scatter gather now works
- *	Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *		Minor fixes for 2.6.
- *
- * To Do:
- *	64bit cleanups
- *	Fix the resource management problems.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/ioport.h>
-#include <linux/jiffies.h>
-#include <linux/interrupt.h>
-#include <linux/timer.h>
-#include <linux/delay.h>
-#include <linux/proc_fs.h>
-#include <linux/prefetch.h>
-#include <linux/pci.h>
-#include <linux/blkdev.h>
-#include <linux/i2o.h>
-#include <linux/scatterlist.h>
-
-#include <asm/dma.h>
-#include <asm/io.h>
-#include <linux/atomic.h>
-
-#include <scsi/scsi.h>
-#include <scsi/scsi_host.h>
-#include <scsi/scsi_device.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/sg.h>
-
-#define OSM_NAME	"scsi-osm"
-#define OSM_VERSION	"1.316"
-#define OSM_DESCRIPTION	"I2O SCSI Peripheral OSM"
-
-static struct i2o_driver i2o_scsi_driver;
-
-static unsigned int i2o_scsi_max_id = 16;
-static unsigned int i2o_scsi_max_lun = 255;
-
-struct i2o_scsi_host {
-	struct Scsi_Host *scsi_host;	/* pointer to the SCSI host */
-	struct i2o_controller *iop;	/* pointer to the I2O controller */
-	u64 lun;	/* lun's used for block devices */
-	struct i2o_device *channel[0];	/* channel->i2o_dev mapping table */
-};
-
-static struct scsi_host_template i2o_scsi_host_template;
-
-#define I2O_SCSI_CAN_QUEUE	4
-
-/* SCSI OSM class handling definition */
-static struct i2o_class_id i2o_scsi_class_id[] = {
-	{I2O_CLASS_SCSI_PERIPHERAL},
-	{I2O_CLASS_END}
-};
-
-static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
-{
-	struct i2o_scsi_host *i2o_shost;
-	struct i2o_device *i2o_dev;
-	struct Scsi_Host *scsi_host;
-	int max_channel = 0;
-	u8 type;
-	int i;
-	size_t size;
-	u16 body_size = 6;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-	if (c->adaptec)
-		body_size = 8;
-#endif
-
-	list_for_each_entry(i2o_dev, &c->devices, list)
-	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
-		if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
-		    && (type == 0x01))	/* SCSI bus */
-			max_channel++;
-	}
-
-	if (!max_channel) {
-		osm_warn("no channels found on %s\n", c->name);
-		return ERR_PTR(-EFAULT);
-	}
-
-	size = max_channel * sizeof(struct i2o_device *)
-	    + sizeof(struct i2o_scsi_host);
-
-	scsi_host = scsi_host_alloc(&i2o_scsi_host_template, size);
-	if (!scsi_host) {
-		osm_warn("Could not allocate SCSI host\n");
-		return ERR_PTR(-ENOMEM);
-	}
-
-	scsi_host->max_channel = max_channel - 1;
-	scsi_host->max_id = i2o_scsi_max_id;
-	scsi_host->max_lun = i2o_scsi_max_lun;
-	scsi_host->this_id = c->unit;
-	scsi_host->sg_tablesize = i2o_sg_tablesize(c, body_size);
-
-	i2o_shost = (struct i2o_scsi_host *)scsi_host->hostdata;
-	i2o_shost->scsi_host = scsi_host;
-	i2o_shost->iop = c;
-	i2o_shost->lun = 1;
-
-	i = 0;
-	list_for_each_entry(i2o_dev, &c->devices, list)
-	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
-		if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
-		    && (type == 0x01))	/* only SCSI bus */
-			i2o_shost->channel[i++] = i2o_dev;
-
-		if (i >= max_channel)
-			break;
-	}
-
-	return i2o_shost;
-};
-
-/**
- *	i2o_scsi_get_host - Get an I2O SCSI host
- *	@c: I2O controller to for which to get the SCSI host
- *
- *	If the I2O controller already exists as SCSI host, the SCSI host
- *	is returned, otherwise the I2O controller is added to the SCSI
- *	core.
- *
- *	Returns pointer to the I2O SCSI host on success or NULL on failure.
- */
-static struct i2o_scsi_host *i2o_scsi_get_host(struct i2o_controller *c)
-{
-	return c->driver_data[i2o_scsi_driver.context];
-};
-
-/**
- *	i2o_scsi_remove - Remove I2O device from SCSI core
- *	@dev: device which should be removed
- *
- *	Removes the I2O device from the SCSI core again.
- *
- *	Returns 0 on success.
- */
-static int i2o_scsi_remove(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_scsi_host *i2o_shost;
-	struct scsi_device *scsi_dev;
-
-	osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid);
-
-	i2o_shost = i2o_scsi_get_host(c);
-
-	shost_for_each_device(scsi_dev, i2o_shost->scsi_host)
-	    if (scsi_dev->hostdata == i2o_dev) {
-		sysfs_remove_link(&i2o_dev->device.kobj, "scsi");
-		scsi_remove_device(scsi_dev);
-		scsi_device_put(scsi_dev);
-		break;
-	}
-
-	return 0;
-};
-
-/**
- *	i2o_scsi_probe - verify if dev is a I2O SCSI device and install it
- *	@dev: device to verify if it is a I2O SCSI device
- *
- *	Retrieve channel, id and lun for I2O device. If everything goes well
- *	register the I2O device as SCSI device on the I2O SCSI controller.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_scsi_probe(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_scsi_host *i2o_shost;
-	struct Scsi_Host *scsi_host;
-	struct i2o_device *parent;
-	struct scsi_device *scsi_dev;
-	u32 id = -1;
-	u64 lun = -1;
-	int channel = -1;
-	int i, rc;
-
-	i2o_shost = i2o_scsi_get_host(c);
-	if (!i2o_shost)
-		return -EFAULT;
-
-	scsi_host = i2o_shost->scsi_host;
-
-	switch (i2o_dev->lct_data.class_id) {
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-	case I2O_CLASS_EXECUTIVE:
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-		if (c->adaptec) {
-			u8 type;
-			struct i2o_device *d = i2o_shost->channel[0];
-
-			if (!i2o_parm_field_get(d, 0x0000, 0, &type, 1)
-			    && (type == 0x01))	/* SCSI bus */
-				if (!i2o_parm_field_get(d, 0x0200, 4, &id, 4)) {
-					channel = 0;
-					if (i2o_dev->lct_data.class_id ==
-					    I2O_CLASS_RANDOM_BLOCK_STORAGE)
-						lun =
-						    cpu_to_le64(i2o_shost->
-								lun++);
-					else
-						lun = 0;
-				}
-		}
-#endif
-		break;
-
-	case I2O_CLASS_SCSI_PERIPHERAL:
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4))
-			return -EFAULT;
-
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8))
-			return -EFAULT;
-
-		parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid);
-		if (!parent) {
-			osm_warn("can not find parent of device %03x\n",
-				 i2o_dev->lct_data.tid);
-			return -EFAULT;
-		}
-
-		for (i = 0; i <= i2o_shost->scsi_host->max_channel; i++)
-			if (i2o_shost->channel[i] == parent)
-				channel = i;
-		break;
-
-	default:
-		return -EFAULT;
-	}
-
-	if (channel == -1) {
-		osm_warn("can not find channel of device %03x\n",
-			 i2o_dev->lct_data.tid);
-		return -EFAULT;
-	}
-
-	if (le32_to_cpu(id) >= scsi_host->max_id) {
-		osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)",
-			 le32_to_cpu(id), scsi_host->max_id);
-		return -EFAULT;
-	}
-
-	if (le64_to_cpu(lun) >= scsi_host->max_lun) {
-		osm_warn("SCSI device lun (%llu) >= max_lun of I2O host (%llu)",
-			 le64_to_cpu(lun), scsi_host->max_lun);
-		return -EFAULT;
-	}
-
-	scsi_dev =
-	    __scsi_add_device(i2o_shost->scsi_host, channel, le32_to_cpu(id),
-			      le64_to_cpu(lun), i2o_dev);
-
-	if (IS_ERR(scsi_dev)) {
-		osm_warn("can not add SCSI device %03x\n",
-			 i2o_dev->lct_data.tid);
-		return PTR_ERR(scsi_dev);
-	}
-
-	rc = sysfs_create_link(&i2o_dev->device.kobj,
-			       &scsi_dev->sdev_gendev.kobj, "scsi");
-	if (rc)
-		goto err;
-
-	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %llu\n",
-		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
-		 le64_to_cpu(lun));
-
-	return 0;
-
-err:
-	scsi_remove_device(scsi_dev);
-	return rc;
-};
-
-static const char *i2o_scsi_info(struct Scsi_Host *SChost)
-{
-	struct i2o_scsi_host *hostdata;
-	hostdata = (struct i2o_scsi_host *)SChost->hostdata;
-	return hostdata->iop->name;
-}
-
-/**
- *	i2o_scsi_reply - SCSI OSM message reply handler
- *	@c: controller issuing the reply
- *	@m: message id for flushing
- *	@msg: the message from the controller
- *
- *	Process reply messages (interrupts in normal scsi controller think).
- *	We can get a variety of messages to process. The normal path is
- *	scsi command completions. We must also deal with IOP failures,
- *	the reply to a bus reset and the reply to a LUN query.
- *
- *	Returns 0 on success and if the reply should not be flushed or > 0
- *	on success and if the reply should be flushed. Returns negative error
- *	code on failure and if the reply should be flushed.
- */
-static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
-			  struct i2o_message *msg)
-{
-	struct scsi_cmnd *cmd;
-	u32 error;
-	struct device *dev;
-
-	cmd = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
-	if (unlikely(!cmd)) {
-		osm_err("NULL reply received!\n");
-		return -1;
-	}
-
-	/*
-	 *      Low byte is device status, next is adapter status,
-	 *      (then one byte reserved), then request status.
-	 */
-	error = le32_to_cpu(msg->body[0]);
-
-	osm_debug("Completed %0x%p\n", cmd);
-
-	cmd->result = error & 0xff;
-	/*
-	 * if DeviceStatus is not SCSI_SUCCESS copy over the sense data and let
-	 * the SCSI layer handle the error
-	 */
-	if (cmd->result)
-		memcpy(cmd->sense_buffer, &msg->body[3],
-		       min(SCSI_SENSE_BUFFERSIZE, 40));
-
-	/* only output error code if AdapterStatus is not HBA_SUCCESS */
-	if ((error >> 8) & 0xff)
-		osm_err("SCSI error %08x\n", error);
-
-	dev = &c->pdev->dev;
-
-	scsi_dma_unmap(cmd);
-
-	cmd->scsi_done(cmd);
-
-	return 1;
-};
-
-/**
- *	i2o_scsi_notify_device_add - Retrieve notifications of added devices
- *	@i2o_dev: the I2O device which was added
- *
- *	If a I2O device is added we catch the notification, because I2O classes
- *	other than SCSI peripheral will not be received through
- *	i2o_scsi_probe().
- */
-static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev)
-{
-	switch (i2o_dev->lct_data.class_id) {
-	case I2O_CLASS_EXECUTIVE:
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-		i2o_scsi_probe(&i2o_dev->device);
-		break;
-
-	default:
-		break;
-	}
-};
-
-/**
- *	i2o_scsi_notify_device_remove - Retrieve notifications of removed devices
- *	@i2o_dev: the I2O device which was removed
- *
- *	If a I2O device is removed, we catch the notification to remove the
- *	corresponding SCSI device.
- */
-static void i2o_scsi_notify_device_remove(struct i2o_device *i2o_dev)
-{
-	switch (i2o_dev->lct_data.class_id) {
-	case I2O_CLASS_EXECUTIVE:
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-		i2o_scsi_remove(&i2o_dev->device);
-		break;
-
-	default:
-		break;
-	}
-};
-
-/**
- *	i2o_scsi_notify_controller_add - Retrieve notifications of added controllers
- *	@c: the controller which was added
- *
- *	If a I2O controller is added, we catch the notification to add a
- *	corresponding Scsi_Host.
- */
-static void i2o_scsi_notify_controller_add(struct i2o_controller *c)
-{
-	struct i2o_scsi_host *i2o_shost;
-	int rc;
-
-	i2o_shost = i2o_scsi_host_alloc(c);
-	if (IS_ERR(i2o_shost)) {
-		osm_err("Could not initialize SCSI host\n");
-		return;
-	}
-
-	rc = scsi_add_host(i2o_shost->scsi_host, &c->device);
-	if (rc) {
-		osm_err("Could not add SCSI host\n");
-		scsi_host_put(i2o_shost->scsi_host);
-		return;
-	}
-
-	c->driver_data[i2o_scsi_driver.context] = i2o_shost;
-
-	osm_debug("new I2O SCSI host added\n");
-};
-
-/**
- *	i2o_scsi_notify_controller_remove - Retrieve notifications of removed controllers
- *	@c: the controller which was removed
- *
- *	If a I2O controller is removed, we catch the notification to remove the
- *	corresponding Scsi_Host.
- */
-static void i2o_scsi_notify_controller_remove(struct i2o_controller *c)
-{
-	struct i2o_scsi_host *i2o_shost;
-	i2o_shost = i2o_scsi_get_host(c);
-	if (!i2o_shost)
-		return;
-
-	c->driver_data[i2o_scsi_driver.context] = NULL;
-
-	scsi_remove_host(i2o_shost->scsi_host);
-	scsi_host_put(i2o_shost->scsi_host);
-	osm_debug("I2O SCSI host removed\n");
-};
-
-/* SCSI OSM driver struct */
-static struct i2o_driver i2o_scsi_driver = {
-	.name = OSM_NAME,
-	.reply = i2o_scsi_reply,
-	.classes = i2o_scsi_class_id,
-	.notify_device_add = i2o_scsi_notify_device_add,
-	.notify_device_remove = i2o_scsi_notify_device_remove,
-	.notify_controller_add = i2o_scsi_notify_controller_add,
-	.notify_controller_remove = i2o_scsi_notify_controller_remove,
-	.driver = {
-		   .probe = i2o_scsi_probe,
-		   .remove = i2o_scsi_remove,
-		   },
-};
-
-/**
- *	i2o_scsi_queuecommand - queue a SCSI command
- *	@SCpnt: scsi command pointer
- *	@done: callback for completion
- *
- *	Issue a scsi command asynchronously. Return 0 on success or 1 if
- *	we hit an error (normally message queue congestion). The only
- *	minor complication here is that I2O deals with the device addressing
- *	so we have to map the bus/dev/lun back to an I2O handle as well
- *	as faking absent devices ourself.
- *
- *	Locks: takes the controller lock on error path only
- */
-
-static int i2o_scsi_queuecommand_lck(struct scsi_cmnd *SCpnt,
-				 void (*done) (struct scsi_cmnd *))
-{
-	struct i2o_controller *c;
-	struct i2o_device *i2o_dev;
-	int tid;
-	struct i2o_message *msg;
-	/*
-	 * ENABLE_DISCONNECT
-	 * SIMPLE_TAG
-	 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
-	 */
-	u32 scsi_flags = 0x20a00000;
-	u32 sgl_offset;
-	u32 *mptr;
-	u32 cmd = I2O_CMD_SCSI_EXEC << 24;
-	int rc = 0;
-
-	/*
-	 *      Do the incoming paperwork
-	 */
-	i2o_dev = SCpnt->device->hostdata;
-
-	SCpnt->scsi_done = done;
-
-	if (unlikely(!i2o_dev)) {
-		osm_warn("no I2O device in request\n");
-		SCpnt->result = DID_NO_CONNECT << 16;
-		done(SCpnt);
-		goto exit;
-	}
-	c = i2o_dev->iop;
-	tid = i2o_dev->lct_data.tid;
-
-	osm_debug("qcmd: Tid = %03x\n", tid);
-	osm_debug("Real scsi messages.\n");
-
-	/*
-	 *      Put together a scsi execscb message
-	 */
-	switch (SCpnt->sc_data_direction) {
-	case PCI_DMA_NONE:
-		/* DATA NO XFER */
-		sgl_offset = SGL_OFFSET_0;
-		break;
-
-	case PCI_DMA_TODEVICE:
-		/* DATA OUT (iop-->dev) */
-		scsi_flags |= 0x80000000;
-		sgl_offset = SGL_OFFSET_10;
-		break;
-
-	case PCI_DMA_FROMDEVICE:
-		/* DATA IN  (iop<--dev) */
-		scsi_flags |= 0x40000000;
-		sgl_offset = SGL_OFFSET_10;
-		break;
-
-	default:
-		/* Unknown - kill the command */
-		SCpnt->result = DID_NO_CONNECT << 16;
-		done(SCpnt);
-		goto exit;
-	}
-
-	/*
-	 *      Obtain an I2O message. If there are none free then
-	 *      throw it back to the scsi layer
-	 */
-
-	msg = i2o_msg_get(c);
-	if (IS_ERR(msg)) {
-		rc = SCSI_MLQUEUE_HOST_BUSY;
-		goto exit;
-	}
-
-	mptr = &msg->body[0];
-
-#if 0 /* this code can't work */
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-	if (c->adaptec) {
-		u32 adpt_flags = 0;
-
-		if (SCpnt->sc_request && SCpnt->sc_request->upper_private_data) {
-			i2o_sg_io_hdr_t __user *usr_ptr =
-			    ((Sg_request *) (SCpnt->sc_request->
-					     upper_private_data))->header.
-			    usr_ptr;
-
-			if (usr_ptr)
-				get_user(adpt_flags, &usr_ptr->flags);
-		}
-
-		switch (i2o_dev->lct_data.class_id) {
-		case I2O_CLASS_EXECUTIVE:
-		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-			/* interpret flag has to be set for executive */
-			adpt_flags ^= I2O_DPT_SG_FLAG_INTERPRET;
-			break;
-
-		default:
-			break;
-		}
-
-		/*
-		 * for Adaptec controllers we use the PRIVATE command, because
-		 * the normal SCSI EXEC doesn't support all SCSI commands on
-		 * all controllers (for example READ CAPACITY).
-		 */
-		if (sgl_offset == SGL_OFFSET_10)
-			sgl_offset = SGL_OFFSET_12;
-		cmd = I2O_CMD_PRIVATE << 24;
-		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
-		*mptr++ = cpu_to_le32(adpt_flags | tid);
-	}
-#endif
-#endif
-
-	msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
-	msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
-
-	/* We want the SCSI control block back */
-	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
-
-	/* LSI_920_PCI_QUIRK
-	 *
-	 *      Intermittant observations of msg frame word data corruption
-	 *      observed on msg[4] after:
-	 *        WRITE, READ-MODIFY-WRITE
-	 *      operations.  19990606 -sralston
-	 *
-	 *      (Hence we build this word via tag. Its good practice anyway
-	 *       we don't want fetches over PCI needlessly)
-	 */
-
-	/* Attach tags to the devices */
-	/* FIXME: implement
-	   if(SCpnt->device->tagged_supported) {
-	   if(SCpnt->tag == HEAD_OF_QUEUE_TAG)
-	   scsi_flags |= 0x01000000;
-	   else if(SCpnt->tag == ORDERED_QUEUE_TAG)
-	   scsi_flags |= 0x01800000;
-	   }
-	 */
-
-	*mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
-
-	/* Write SCSI command into the message - always 16 byte block */
-	memcpy(mptr, SCpnt->cmnd, 16);
-	mptr += 4;
-
-	if (sgl_offset != SGL_OFFSET_0) {
-		/* write size of data addressed by SGL */
-		*mptr++ = cpu_to_le32(scsi_bufflen(SCpnt));
-
-		/* Now fill in the SGList and command */
-
-		if (scsi_sg_count(SCpnt)) {
-			if (!i2o_dma_map_sg(c, scsi_sglist(SCpnt),
-					    scsi_sg_count(SCpnt),
-					    SCpnt->sc_data_direction, &mptr))
-				goto nomem;
-		}
-	}
-
-	/* Stick the headers on */
-	msg->u.head[0] =
-	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
-
-	/* Queue the message */
-	i2o_msg_post(c, msg);
-
-	osm_debug("Issued %0x%p\n", SCpnt);
-
-	return 0;
-
-      nomem:
-	rc = -ENOMEM;
-	i2o_msg_nop(c, msg);
-
-      exit:
-	return rc;
-}
-
-static DEF_SCSI_QCMD(i2o_scsi_queuecommand)
-
-/**
- *	i2o_scsi_abort - abort a running command
- *	@SCpnt: command to abort
- *
- *	Ask the I2O controller to abort a command. This is an asynchrnous
- *	process and our callback handler will see the command complete with an
- *	aborted message if it succeeds.
- *
- *	Returns 0 if the command is successfully aborted or negative error code
- *	on failure.
- */
-static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
-{
-	struct i2o_device *i2o_dev;
-	struct i2o_controller *c;
-	struct i2o_message *msg;
-	int tid;
-	int status = FAILED;
-
-	osm_warn("Aborting command block.\n");
-
-	i2o_dev = SCpnt->device->hostdata;
-	c = i2o_dev->iop;
-	tid = i2o_dev->lct_data.tid;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return SCSI_MLQUEUE_HOST_BUSY;
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid);
-	msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt));
-
-	if (!i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT))
-		status = SUCCESS;
-
-	return status;
-}
-
-/**
- *	i2o_scsi_bios_param	-	Invent disk geometry
- *	@sdev: scsi device
- *	@dev: block layer device
- *	@capacity: size in sectors
- *	@ip: geometry array
- *
- *	This is anyone's guess quite frankly. We use the same rules everyone
- *	else appears to and hope. It seems to work.
- */
-
-static int i2o_scsi_bios_param(struct scsi_device *sdev,
-			       struct block_device *dev, sector_t capacity,
-			       int *ip)
-{
-	int size;
-
-	size = capacity;
-	ip[0] = 64;		/* heads                        */
-	ip[1] = 32;		/* sectors                      */
-	if ((ip[2] = size >> 11) > 1024) {	/* cylinders, test for big disk */
-		ip[0] = 255;	/* heads                        */
-		ip[1] = 63;	/* sectors                      */
-		ip[2] = size / (255 * 63);	/* cylinders                    */
-	}
-	return 0;
-}
-
-static struct scsi_host_template i2o_scsi_host_template = {
-	.proc_name = OSM_NAME,
-	.name = OSM_DESCRIPTION,
-	.info = i2o_scsi_info,
-	.queuecommand = i2o_scsi_queuecommand,
-	.eh_abort_handler = i2o_scsi_abort,
-	.bios_param = i2o_scsi_bios_param,
-	.can_queue = I2O_SCSI_CAN_QUEUE,
-	.sg_tablesize = 8,
-	.cmd_per_lun = 6,
-	.use_clustering = ENABLE_CLUSTERING,
-};
-
-/**
- *	i2o_scsi_init - SCSI OSM initialization function
- *
- *	Register SCSI OSM into I2O core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_scsi_init(void)
-{
-	int rc;
-
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
-	/* Register SCSI OSM into I2O core */
-	rc = i2o_driver_register(&i2o_scsi_driver);
-	if (rc) {
-		osm_err("Could not register SCSI driver\n");
-		return rc;
-	}
-
-	return 0;
-};
-
-/**
- *	i2o_scsi_exit - SCSI OSM exit function
- *
- *	Unregisters SCSI OSM from I2O core.
- */
-static void __exit i2o_scsi_exit(void)
-{
-	/* Unregister I2O SCSI OSM from I2O core */
-	i2o_driver_unregister(&i2o_scsi_driver);
-};
-
-MODULE_AUTHOR("Red Hat Software");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-module_init(i2o_scsi_init);
-module_exit(i2o_scsi_exit);
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
deleted file mode 100644
index 92752fb5b2d3..000000000000
--- a/drivers/message/i2o/iop.c
+++ /dev/null
@@ -1,1247 +0,0 @@
-/*
- *	Functions to handle I2O controllers and I2O message handling
- *
- *	Copyright (C) 1999-2002	Red Hat Software
- *
- *	Written by Alan Cox, Building Number Three Ltd
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	A lot of the I2O message side code from this is taken from the
- *	Red Creek RCPCI45 adapter driver by Red Creek Communications
- *
- *	Fixes/additions:
- *		Philipp Rumpf
- *		Juha Sievänen <Juha.Sievanen@cs.Helsinki.FI>
- *		Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
- *		Deepak Saxena <deepak@plexity.net>
- *		Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
- *		Alan Cox <alan@lxorguk.ukuu.org.uk>:
- *			Ported to Linux 2.5.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Minor fixes for 2.6.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include "core.h"
-
-#define OSM_NAME	"i2o"
-#define OSM_VERSION	"1.325"
-#define OSM_DESCRIPTION	"I2O subsystem"
-
-/* global I2O controller list */
-LIST_HEAD(i2o_controllers);
-
-/*
- * global I2O System Table. Contains information about all the IOPs in the
- * system. Used to inform IOPs about each others existence.
- */
-static struct i2o_dma i2o_systab;
-
-static int i2o_hrt_get(struct i2o_controller *c);
-
-/**
- *	i2o_msg_get_wait - obtain an I2O message from the IOP
- *	@c: I2O controller
- *	@wait: how long to wait until timeout
- *
- *	This function waits up to wait seconds for a message slot to be
- *	available.
- *
- *	On a success the message is returned and the pointer to the message is
- *	set in msg. The returned message is the physical page frame offset
- *	address from the read port (see the i2o spec). If no message is
- *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
- */
-struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
-{
-	unsigned long timeout = jiffies + wait * HZ;
-	struct i2o_message *msg;
-
-	while (IS_ERR(msg = i2o_msg_get(c))) {
-		if (time_after(jiffies, timeout)) {
-			osm_debug("%s: Timeout waiting for message frame.\n",
-				  c->name);
-			return ERR_PTR(-ETIMEDOUT);
-		}
-		schedule_timeout_uninterruptible(1);
-	}
-
-	return msg;
-};
-
-#if BITS_PER_LONG == 64
-/**
- *      i2o_cntxt_list_add - Append a pointer to context list and return a id
- *	@c: controller to which the context list belong
- *	@ptr: pointer to add to the context list
- *
- *	Because the context field in I2O is only 32-bit large, on 64-bit the
- *	pointer is to large to fit in the context field. The i2o_cntxt_list
- *	functions therefore map pointers to context fields.
- *
- *	Returns context id > 0 on success or 0 on failure.
- */
-u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
-{
-	struct i2o_context_list_element *entry;
-	unsigned long flags;
-
-	if (!ptr)
-		osm_err("%s: couldn't add NULL pointer to context list!\n",
-			c->name);
-
-	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
-	if (!entry) {
-		osm_err("%s: Could not allocate memory for context list element"
-			"\n", c->name);
-		return 0;
-	}
-
-	entry->ptr = ptr;
-	entry->timestamp = jiffies;
-	INIT_LIST_HEAD(&entry->list);
-
-	spin_lock_irqsave(&c->context_list_lock, flags);
-
-	if (unlikely(atomic_inc_and_test(&c->context_list_counter)))
-		atomic_inc(&c->context_list_counter);
-
-	entry->context = atomic_read(&c->context_list_counter);
-
-	list_add(&entry->list, &c->context_list);
-
-	spin_unlock_irqrestore(&c->context_list_lock, flags);
-
-	osm_debug("%s: Add context to list %p -> %d\n", c->name, ptr, context);
-
-	return entry->context;
-};
-
-/**
- *      i2o_cntxt_list_remove - Remove a pointer from the context list
- *	@c: controller to which the context list belong
- *	@ptr: pointer which should be removed from the context list
- *
- *	Removes a previously added pointer from the context list and returns
- *	the matching context id.
- *
- *	Returns context id on success or 0 on failure.
- */
-u32 i2o_cntxt_list_remove(struct i2o_controller * c, void *ptr)
-{
-	struct i2o_context_list_element *entry;
-	u32 context = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&c->context_list_lock, flags);
-	list_for_each_entry(entry, &c->context_list, list)
-	    if (entry->ptr == ptr) {
-		list_del(&entry->list);
-		context = entry->context;
-		kfree(entry);
-		break;
-	}
-	spin_unlock_irqrestore(&c->context_list_lock, flags);
-
-	if (!context)
-		osm_warn("%s: Could not remove nonexistent ptr %p\n", c->name,
-			 ptr);
-
-	osm_debug("%s: remove ptr from context list %d -> %p\n", c->name,
-		  context, ptr);
-
-	return context;
-};
-
-/**
- *      i2o_cntxt_list_get - Get a pointer from the context list and remove it
- *	@c: controller to which the context list belong
- *	@context: context id to which the pointer belong
- *
- *	Returns pointer to the matching context id on success or NULL on
- *	failure.
- */
-void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
-{
-	struct i2o_context_list_element *entry;
-	unsigned long flags;
-	void *ptr = NULL;
-
-	spin_lock_irqsave(&c->context_list_lock, flags);
-	list_for_each_entry(entry, &c->context_list, list)
-	    if (entry->context == context) {
-		list_del(&entry->list);
-		ptr = entry->ptr;
-		kfree(entry);
-		break;
-	}
-	spin_unlock_irqrestore(&c->context_list_lock, flags);
-
-	if (!ptr)
-		osm_warn("%s: context id %d not found\n", c->name, context);
-
-	osm_debug("%s: get ptr from context list %d -> %p\n", c->name, context,
-		  ptr);
-
-	return ptr;
-};
-
-/**
- *      i2o_cntxt_list_get_ptr - Get a context id from the context list
- *	@c: controller to which the context list belong
- *	@ptr: pointer to which the context id should be fetched
- *
- *	Returns context id which matches to the pointer on success or 0 on
- *	failure.
- */
-u32 i2o_cntxt_list_get_ptr(struct i2o_controller * c, void *ptr)
-{
-	struct i2o_context_list_element *entry;
-	u32 context = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&c->context_list_lock, flags);
-	list_for_each_entry(entry, &c->context_list, list)
-	    if (entry->ptr == ptr) {
-		context = entry->context;
-		break;
-	}
-	spin_unlock_irqrestore(&c->context_list_lock, flags);
-
-	if (!context)
-		osm_warn("%s: Could not find nonexistent ptr %p\n", c->name,
-			 ptr);
-
-	osm_debug("%s: get context id from context list %p -> %d\n", c->name,
-		  ptr, context);
-
-	return context;
-};
-#endif
-
-/**
- *	i2o_iop_find - Find an I2O controller by id
- *	@unit: unit number of the I2O controller to search for
- *
- *	Lookup the I2O controller on the controller list.
- *
- *	Returns pointer to the I2O controller on success or NULL if not found.
- */
-struct i2o_controller *i2o_find_iop(int unit)
-{
-	struct i2o_controller *c;
-
-	list_for_each_entry(c, &i2o_controllers, list) {
-		if (c->unit == unit)
-			return c;
-	}
-
-	return NULL;
-};
-
-/**
- *	i2o_iop_find_device - Find a I2O device on an I2O controller
- *	@c: I2O controller where the I2O device hangs on
- *	@tid: TID of the I2O device to search for
- *
- *	Searches the devices of the I2O controller for a device with TID tid and
- *	returns it.
- *
- *	Returns a pointer to the I2O device if found, otherwise NULL.
- */
-struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid)
-{
-	struct i2o_device *dev;
-
-	list_for_each_entry(dev, &c->devices, list)
-	    if (dev->lct_data.tid == tid)
-		return dev;
-
-	return NULL;
-};
-
-/**
- *	i2o_quiesce_controller - quiesce controller
- *	@c: controller
- *
- *	Quiesce an IOP. Causes IOP to make external operation quiescent
- *	(i2o 'READY' state). Internal operation of the IOP continues normally.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_quiesce(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	i2o_status_block *sb = c->status_block.virt;
-	int rc;
-
-	i2o_status_get(c);
-
-	/* SysQuiesce discarded if IOP not in READY or OPERATIONAL state */
-	if ((sb->iop_state != ADAPTER_STATE_READY) &&
-	    (sb->iop_state != ADAPTER_STATE_OPERATIONAL))
-		return 0;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-
-	/* Long timeout needed for quiesce if lots of devices */
-	if ((rc = i2o_msg_post_wait(c, msg, 240)))
-		osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc);
-	else
-		osm_debug("%s: Quiesced.\n", c->name);
-
-	i2o_status_get(c);	// Entered READY state
-
-	return rc;
-};
-
-/**
- *	i2o_iop_enable - move controller from ready to OPERATIONAL
- *	@c: I2O controller
- *
- *	Enable IOP. This allows the IOP to resume external operations and
- *	reverses the effect of a quiesce. Returns zero or an error code if
- *	an error occurs.
- */
-static int i2o_iop_enable(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	i2o_status_block *sb = c->status_block.virt;
-	int rc;
-
-	i2o_status_get(c);
-
-	/* Enable only allowed on READY state */
-	if (sb->iop_state != ADAPTER_STATE_READY)
-		return -EINVAL;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-
-	/* How long of a timeout do we need? */
-	if ((rc = i2o_msg_post_wait(c, msg, 240)))
-		osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc);
-	else
-		osm_debug("%s: Enabled.\n", c->name);
-
-	i2o_status_get(c);	// entered OPERATIONAL state
-
-	return rc;
-};
-
-/**
- *	i2o_iop_quiesce_all - Quiesce all I2O controllers on the system
- *
- *	Quiesce all I2O controllers which are connected to the system.
- */
-static inline void i2o_iop_quiesce_all(void)
-{
-	struct i2o_controller *c, *tmp;
-
-	list_for_each_entry_safe(c, tmp, &i2o_controllers, list) {
-		if (!c->no_quiesce)
-			i2o_iop_quiesce(c);
-	}
-};
-
-/**
- *	i2o_iop_enable_all - Enables all controllers on the system
- *
- *	Enables all I2O controllers which are connected to the system.
- */
-static inline void i2o_iop_enable_all(void)
-{
-	struct i2o_controller *c, *tmp;
-
-	list_for_each_entry_safe(c, tmp, &i2o_controllers, list)
-	    i2o_iop_enable(c);
-};
-
-/**
- *	i2o_clear_controller - Bring I2O controller into HOLD state
- *	@c: controller
- *
- *	Clear an IOP to HOLD state, ie. terminate external operations, clear all
- *	input queues and prepare for a system restart. IOP's internal operation
- *	continues normally and the outbound queue is alive. The IOP is not
- *	expected to rebuild its LCT.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_clear(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	int rc;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	/* Quiesce all IOPs first */
-	i2o_iop_quiesce_all();
-
-	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-
-	if ((rc = i2o_msg_post_wait(c, msg, 30)))
-		osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc);
-	else
-		osm_debug("%s: Cleared.\n", c->name);
-
-	/* Enable all IOPs */
-	i2o_iop_enable_all();
-
-	return rc;
-}
-
-/**
- *	i2o_iop_init_outbound_queue - setup the outbound message queue
- *	@c: I2O controller
- *
- *	Clear and (re)initialize IOP's outbound queue and post the message
- *	frames to the IOP.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
-{
-	u32 m;
-	volatile u8 *status = c->status.virt;
-	struct i2o_message *msg;
-	ulong timeout;
-	int i;
-
-	osm_debug("%s: Initializing Outbound Queue...\n", c->name);
-
-	memset(c->status.virt, 0, 4);
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
-	msg->body[0] = cpu_to_le32(PAGE_SIZE);
-	/* Outbound msg frame size in words and Initcode */
-	msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80);
-	msg->body[2] = cpu_to_le32(0xd0000004);
-	msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys));
-	msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys));
-
-	i2o_msg_post(c, msg);
-
-	timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
-	while (*status <= I2O_CMD_IN_PROGRESS) {
-		if (time_after(jiffies, timeout)) {
-			osm_warn("%s: Timeout Initializing\n", c->name);
-			return -ETIMEDOUT;
-		}
-		schedule_timeout_uninterruptible(1);
-	}
-
-	m = c->out_queue.phys;
-
-	/* Post frames */
-	for (i = 0; i < I2O_MAX_OUTBOUND_MSG_FRAMES; i++) {
-		i2o_flush_reply(c, m);
-		udelay(1);	/* Promise */
-		m += I2O_OUTBOUND_MSG_FRAME_SIZE * sizeof(u32);
-	}
-
-	return 0;
-}
-
-/**
- *	i2o_iop_reset - reset an I2O controller
- *	@c: controller to reset
- *
- *	Reset the IOP into INIT state and wait until IOP gets into RESET state.
- *	Terminate all external operations, clear IOP's inbound and outbound
- *	queues, terminate all DDMs, and reload the IOP's operating environment
- *	and all local DDMs. The IOP rebuilds its LCT.
- */
-static int i2o_iop_reset(struct i2o_controller *c)
-{
-	volatile u8 *status = c->status.virt;
-	struct i2o_message *msg;
-	unsigned long timeout;
-	i2o_status_block *sb = c->status_block.virt;
-	int rc = 0;
-
-	osm_debug("%s: Resetting controller\n", c->name);
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	memset(c->status_block.virt, 0, 8);
-
-	/* Quiesce all IOPs first */
-	i2o_iop_quiesce_all();
-
-	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
-	msg->body[0] = cpu_to_le32(0x00000000);
-	msg->body[1] = cpu_to_le32(0x00000000);
-	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys));
-	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys));
-
-	i2o_msg_post(c, msg);
-
-	/* Wait for a reply */
-	timeout = jiffies + I2O_TIMEOUT_RESET * HZ;
-	while (!*status) {
-		if (time_after(jiffies, timeout))
-			break;
-
-		schedule_timeout_uninterruptible(1);
-	}
-
-	switch (*status) {
-	case I2O_CMD_REJECTED:
-		osm_warn("%s: IOP reset rejected\n", c->name);
-		rc = -EPERM;
-		break;
-
-	case I2O_CMD_IN_PROGRESS:
-		/*
-		 * Once the reset is sent, the IOP goes into the INIT state
-		 * which is indeterminate. We need to wait until the IOP has
-		 * rebooted before we can let the system talk to it. We read
-		 * the inbound Free_List until a message is available. If we
-		 * can't read one in the given amount of time, we assume the
-		 * IOP could not reboot properly.
-		 */
-		osm_debug("%s: Reset in progress, waiting for reboot...\n",
-			  c->name);
-
-		while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) {
-			if (time_after(jiffies, timeout)) {
-				osm_err("%s: IOP reset timeout.\n", c->name);
-				rc = PTR_ERR(msg);
-				goto exit;
-			}
-			schedule_timeout_uninterruptible(1);
-		}
-		i2o_msg_nop(c, msg);
-
-		/* from here all quiesce commands are safe */
-		c->no_quiesce = 0;
-
-		/* verify if controller is in state RESET */
-		i2o_status_get(c);
-
-		if (!c->promise && (sb->iop_state != ADAPTER_STATE_RESET))
-			osm_warn("%s: reset completed, but adapter not in RESET"
-				 " state.\n", c->name);
-		else
-			osm_debug("%s: reset completed.\n", c->name);
-
-		break;
-
-	default:
-		osm_err("%s: IOP reset timeout.\n", c->name);
-		rc = -ETIMEDOUT;
-		break;
-	}
-
-      exit:
-	/* Enable all IOPs */
-	i2o_iop_enable_all();
-
-	return rc;
-};
-
-/**
- *	i2o_iop_activate - Bring controller up to HOLD
- *	@c: controller
- *
- *	This function brings an I2O controller into HOLD state. The adapter
- *	is reset if necessary and then the queues and resource table are read.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_activate(struct i2o_controller *c)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	int rc;
-	int state;
-
-	/* In INIT state, Wait Inbound Q to initialize (in i2o_status_get) */
-	/* In READY state, Get status */
-
-	rc = i2o_status_get(c);
-	if (rc) {
-		osm_info("%s: Unable to obtain status, attempting a reset.\n",
-			 c->name);
-		rc = i2o_iop_reset(c);
-		if (rc)
-			return rc;
-	}
-
-	if (sb->i2o_version > I2OVER15) {
-		osm_err("%s: Not running version 1.5 of the I2O Specification."
-			"\n", c->name);
-		return -ENODEV;
-	}
-
-	switch (sb->iop_state) {
-	case ADAPTER_STATE_FAULTED:
-		osm_err("%s: hardware fault\n", c->name);
-		return -EFAULT;
-
-	case ADAPTER_STATE_READY:
-	case ADAPTER_STATE_OPERATIONAL:
-	case ADAPTER_STATE_HOLD:
-	case ADAPTER_STATE_FAILED:
-		osm_debug("%s: already running, trying to reset...\n", c->name);
-		rc = i2o_iop_reset(c);
-		if (rc)
-			return rc;
-	}
-
-	/* preserve state */
-	state = sb->iop_state;
-
-	rc = i2o_iop_init_outbound_queue(c);
-	if (rc)
-		return rc;
-
-	/* if adapter was not in RESET state clear now */
-	if (state != ADAPTER_STATE_RESET)
-		i2o_iop_clear(c);
-
-	i2o_status_get(c);
-
-	if (sb->iop_state != ADAPTER_STATE_HOLD) {
-		osm_err("%s: failed to bring IOP into HOLD state\n", c->name);
-		return -EIO;
-	}
-
-	return i2o_hrt_get(c);
-};
-
-static void i2o_res_alloc(struct i2o_controller *c, unsigned long flags)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	struct resource *res = &c->mem_resource;
-	resource_size_t size, align;
-	int err;
-
-	res->name = c->pdev->bus->name;
-	res->flags = flags;
-	res->start = 0;
-	res->end = 0;
-	osm_info("%s: requires private memory resources.\n", c->name);
-
-	if (flags & IORESOURCE_MEM) {
-		size = sb->desired_mem_size;
-		align = 1 << 20;	/* unspecified, use 1Mb and play safe */
-	} else {
-		size = sb->desired_io_size;
-		align = 1 << 12;	/* unspecified, use 4Kb and play safe */
-	}
-
-	err = pci_bus_alloc_resource(c->pdev->bus, res, size, align, 0, 0,
-				     NULL, NULL);
-	if (err < 0)
-		return;
-
-	if (flags & IORESOURCE_MEM) {
-		c->mem_alloc = 1;
-		sb->current_mem_size = resource_size(res);
-		sb->current_mem_base = res->start;
-	} else if (flags & IORESOURCE_IO) {
-		c->io_alloc = 1;
-		sb->current_io_size = resource_size(res);
-		sb->current_io_base = res->start;
-	}
-	osm_info("%s: allocated PCI space %pR\n", c->name, res);
-}
-
-/**
- *	i2o_iop_systab_set - Set the I2O System Table of the specified IOP
- *	@c: I2O controller to which the system table should be send
- *
- *	Before the systab could be set i2o_systab_build() must be called.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_systab_set(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	i2o_status_block *sb = c->status_block.virt;
-	struct device *dev = &c->pdev->dev;
-	int rc;
-
-	if (sb->current_mem_size < sb->desired_mem_size)
-		i2o_res_alloc(c, IORESOURCE_MEM);
-
-	if (sb->current_io_size < sb->desired_io_size)
-		i2o_res_alloc(c, IORESOURCE_IO);
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
-					 PCI_DMA_TODEVICE);
-	if (!i2o_systab.phys) {
-		i2o_msg_nop(c, msg);
-		return -ENOMEM;
-	}
-
-	msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-
-	/*
-	 * Provide three SGL-elements:
-	 * System table (SysTab), Private memory space declaration and
-	 * Private i/o space declaration
-	 */
-
-	msg->body[0] = cpu_to_le32(c->unit + 2);
-	msg->body[1] = cpu_to_le32(0x00000000);
-	msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
-	msg->body[3] = cpu_to_le32(i2o_systab.phys);
-	msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
-	msg->body[5] = cpu_to_le32(sb->current_mem_base);
-	msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
-	msg->body[6] = cpu_to_le32(sb->current_io_base);
-
-	rc = i2o_msg_post_wait(c, msg, 120);
-
-	dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
-			 PCI_DMA_TODEVICE);
-
-	if (rc < 0)
-		osm_err("%s: Unable to set SysTab (status=%#x).\n", c->name,
-			-rc);
-	else
-		osm_debug("%s: SysTab set.\n", c->name);
-
-	return rc;
-}
-
-/**
- *	i2o_iop_online - Bring a controller online into OPERATIONAL state.
- *	@c: I2O controller
- *
- *	Send the system table and enable the I2O controller.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_online(struct i2o_controller *c)
-{
-	int rc;
-
-	rc = i2o_iop_systab_set(c);
-	if (rc)
-		return rc;
-
-	/* In READY state */
-	osm_debug("%s: Attempting to enable...\n", c->name);
-	rc = i2o_iop_enable(c);
-	if (rc)
-		return rc;
-
-	return 0;
-};
-
-/**
- *	i2o_iop_remove - Remove the I2O controller from the I2O core
- *	@c: I2O controller
- *
- *	Remove the I2O controller from the I2O core. If devices are attached to
- *	the controller remove these also and finally reset the controller.
- */
-void i2o_iop_remove(struct i2o_controller *c)
-{
-	struct i2o_device *dev, *tmp;
-
-	osm_debug("%s: deleting controller\n", c->name);
-
-	i2o_driver_notify_controller_remove_all(c);
-
-	list_del(&c->list);
-
-	list_for_each_entry_safe(dev, tmp, &c->devices, list)
-	    i2o_device_remove(dev);
-
-	device_del(&c->device);
-
-	/* Ask the IOP to switch to RESET state */
-	i2o_iop_reset(c);
-}
-
-/**
- *	i2o_systab_build - Build system table
- *
- *	The system table contains information about all the IOPs in the system
- *	(duh) and is used by the Executives on the IOPs to establish peer2peer
- *	connections. We're not supporting peer2peer at the moment, but this
- *	will be needed down the road for things like lan2lan forwarding.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_systab_build(void)
-{
-	struct i2o_controller *c, *tmp;
-	int num_controllers = 0;
-	u32 change_ind = 0;
-	int count = 0;
-	struct i2o_sys_tbl *systab = i2o_systab.virt;
-
-	list_for_each_entry_safe(c, tmp, &i2o_controllers, list)
-	    num_controllers++;
-
-	if (systab) {
-		change_ind = systab->change_ind;
-		kfree(i2o_systab.virt);
-	}
-
-	/* Header + IOPs */
-	i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers *
-	    sizeof(struct i2o_sys_tbl_entry);
-
-	systab = i2o_systab.virt = kzalloc(i2o_systab.len, GFP_KERNEL);
-	if (!systab) {
-		osm_err("unable to allocate memory for System Table\n");
-		return -ENOMEM;
-	}
-
-	systab->version = I2OVERSION;
-	systab->change_ind = change_ind + 1;
-
-	list_for_each_entry_safe(c, tmp, &i2o_controllers, list) {
-		i2o_status_block *sb;
-
-		if (count >= num_controllers) {
-			osm_err("controller added while building system table"
-				"\n");
-			break;
-		}
-
-		sb = c->status_block.virt;
-
-		/*
-		 * Get updated IOP state so we have the latest information
-		 *
-		 * We should delete the controller at this point if it
-		 * doesn't respond since if it's not on the system table
-		 * it is techninically not part of the I2O subsystem...
-		 */
-		if (unlikely(i2o_status_get(c))) {
-			osm_err("%s: Deleting b/c could not get status while "
-				"attempting to build system table\n", c->name);
-			i2o_iop_remove(c);
-			continue;	// try the next one
-		}
-
-		systab->iops[count].org_id = sb->org_id;
-		systab->iops[count].iop_id = c->unit + 2;
-		systab->iops[count].seg_num = 0;
-		systab->iops[count].i2o_version = sb->i2o_version;
-		systab->iops[count].iop_state = sb->iop_state;
-		systab->iops[count].msg_type = sb->msg_type;
-		systab->iops[count].frame_size = sb->inbound_frame_size;
-		systab->iops[count].last_changed = change_ind;
-		systab->iops[count].iop_capabilities = sb->iop_capabilities;
-		systab->iops[count].inbound_low =
-		    i2o_dma_low(c->base.phys + I2O_IN_PORT);
-		systab->iops[count].inbound_high =
-		    i2o_dma_high(c->base.phys + I2O_IN_PORT);
-
-		count++;
-	}
-
-	systab->num_entries = count;
-
-	return 0;
-};
-
-/**
- *	i2o_parse_hrt - Parse the hardware resource table.
- *	@c: I2O controller
- *
- *	We don't do anything with it except dumping it (in debug mode).
- *
- *	Returns 0.
- */
-static int i2o_parse_hrt(struct i2o_controller *c)
-{
-	i2o_dump_hrt(c);
-	return 0;
-};
-
-/**
- *	i2o_status_get - Get the status block from the I2O controller
- *	@c: I2O controller
- *
- *	Issue a status query on the controller. This updates the attached
- *	status block. The status block could then be accessed through
- *	c->status_block.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_status_get(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	volatile u8 *status_block;
-	unsigned long timeout;
-
-	status_block = (u8 *) c->status_block.virt;
-	memset(c->status_block.virt, 0, sizeof(i2o_status_block));
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
-	msg->body[0] = cpu_to_le32(0x00000000);
-	msg->body[1] = cpu_to_le32(0x00000000);
-	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys));
-	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys));
-	msg->body[4] = cpu_to_le32(sizeof(i2o_status_block));	/* always 88 bytes */
-
-	i2o_msg_post(c, msg);
-
-	/* Wait for a reply */
-	timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ;
-	while (status_block[87] != 0xFF) {
-		if (time_after(jiffies, timeout)) {
-			osm_err("%s: Get status timeout.\n", c->name);
-			return -ETIMEDOUT;
-		}
-
-		schedule_timeout_uninterruptible(1);
-	}
-
-#ifdef DEBUG
-	i2o_debug_state(c);
-#endif
-
-	return 0;
-}
-
-/*
- *	i2o_hrt_get - Get the Hardware Resource Table from the I2O controller
- *	@c: I2O controller from which the HRT should be fetched
- *
- *	The HRT contains information about possible hidden devices but is
- *	mostly useless to us.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_hrt_get(struct i2o_controller *c)
-{
-	int rc;
-	int i;
-	i2o_hrt *hrt = c->hrt.virt;
-	u32 size = sizeof(i2o_hrt);
-	struct device *dev = &c->pdev->dev;
-
-	for (i = 0; i < I2O_HRT_GET_TRIES; i++) {
-		struct i2o_message *msg;
-
-		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-
-		msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4);
-		msg->u.head[1] =
-		    cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 |
-				ADAPTER_TID);
-		msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len);
-		msg->body[1] = cpu_to_le32(c->hrt.phys);
-
-		rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt);
-
-		if (rc < 0) {
-			osm_err("%s: Unable to get HRT (status=%#x)\n", c->name,
-				-rc);
-			return rc;
-		}
-
-		size = hrt->num_entries * hrt->entry_len << 2;
-		if (size > c->hrt.len) {
-			if (i2o_dma_realloc(dev, &c->hrt, size))
-				return -ENOMEM;
-			else
-				hrt = c->hrt.virt;
-		} else
-			return i2o_parse_hrt(c);
-	}
-
-	osm_err("%s: Unable to get HRT after %d tries, giving up\n", c->name,
-		I2O_HRT_GET_TRIES);
-
-	return -EBUSY;
-}
-
-/**
- *	i2o_iop_release - release the memory for a I2O controller
- *	@dev: I2O controller which should be released
- *
- *	Release the allocated memory. This function is called if refcount of
- *	device reaches 0 automatically.
- */
-static void i2o_iop_release(struct device *dev)
-{
-	struct i2o_controller *c = to_i2o_controller(dev);
-
-	i2o_iop_free(c);
-};
-
-/**
- *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
- *
- *	Allocate the necessary memory for a i2o_controller struct and
- *	initialize the lists and message mempool.
- *
- *	Returns a pointer to the I2O controller or a negative error code on
- *	failure.
- */
-struct i2o_controller *i2o_iop_alloc(void)
-{
-	static int unit = 0;	/* 0 and 1 are NULL IOP and Local Host */
-	struct i2o_controller *c;
-	char poolname[32];
-
-	c = kzalloc(sizeof(*c), GFP_KERNEL);
-	if (!c) {
-		osm_err("i2o: Insufficient memory to allocate a I2O controller."
-			"\n");
-		return ERR_PTR(-ENOMEM);
-	}
-
-	c->unit = unit++;
-	sprintf(c->name, "iop%d", c->unit);
-
-	snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
-	if (i2o_pool_alloc
-	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32),
-	     I2O_MSG_INPOOL_MIN)) {
-		kfree(c);
-		return ERR_PTR(-ENOMEM);
-	};
-
-	INIT_LIST_HEAD(&c->devices);
-	spin_lock_init(&c->lock);
-	mutex_init(&c->lct_lock);
-
-	device_initialize(&c->device);
-
-	c->device.release = &i2o_iop_release;
-
-	dev_set_name(&c->device, "iop%d", c->unit);
-
-#if BITS_PER_LONG == 64
-	spin_lock_init(&c->context_list_lock);
-	atomic_set(&c->context_list_counter, 0);
-	INIT_LIST_HEAD(&c->context_list);
-#endif
-
-	return c;
-};
-
-/**
- *	i2o_iop_add - Initialize the I2O controller and add him to the I2O core
- *	@c: controller
- *
- *	Initialize the I2O controller and if no error occurs add him to the I2O
- *	core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_iop_add(struct i2o_controller *c)
-{
-	int rc;
-
-	if ((rc = device_add(&c->device))) {
-		osm_err("%s: could not add controller\n", c->name);
-		goto iop_reset;
-	}
-
-	osm_info("%s: Activating I2O controller...\n", c->name);
-	osm_info("%s: This may take a few minutes if there are many devices\n",
-		 c->name);
-
-	if ((rc = i2o_iop_activate(c))) {
-		osm_err("%s: could not activate controller\n", c->name);
-		goto device_del;
-	}
-
-	osm_debug("%s: building sys table...\n", c->name);
-
-	if ((rc = i2o_systab_build()))
-		goto device_del;
-
-	osm_debug("%s: online controller...\n", c->name);
-
-	if ((rc = i2o_iop_online(c)))
-		goto device_del;
-
-	osm_debug("%s: getting LCT...\n", c->name);
-
-	if ((rc = i2o_exec_lct_get(c)))
-		goto device_del;
-
-	list_add(&c->list, &i2o_controllers);
-
-	i2o_driver_notify_controller_add_all(c);
-
-	osm_info("%s: Controller added\n", c->name);
-
-	return 0;
-
-      device_del:
-	device_del(&c->device);
-
-      iop_reset:
-	i2o_iop_reset(c);
-
-	return rc;
-};
-
-/**
- *	i2o_event_register - Turn on/off event notification for a I2O device
- *	@dev: I2O device which should receive the event registration request
- *	@drv: driver which want to get notified
- *	@tcntxt: transaction context to use with this notifier
- *	@evt_mask: mask of events
- *
- *	Create and posts an event registration message to the task. No reply
- *	is waited for, or expected. If you do not want further notifications,
- *	call the i2o_event_register again with a evt_mask of 0.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv,
-		       int tcntxt, u32 evt_mask)
-{
-	struct i2o_controller *c = dev->iop;
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->
-			lct_data.tid);
-	msg->u.s.icntxt = cpu_to_le32(drv->context);
-	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
-	msg->body[0] = cpu_to_le32(evt_mask);
-
-	i2o_msg_post(c, msg);
-
-	return 0;
-};
-
-/**
- *	i2o_iop_init - I2O main initialization function
- *
- *	Initialize the I2O drivers (OSM) functions, register the Executive OSM,
- *	initialize the I2O PCI part and finally initialize I2O device stuff.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int __init i2o_iop_init(void)
-{
-	int rc = 0;
-
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
-	if ((rc = i2o_driver_init()))
-		goto exit;
-
-	if ((rc = i2o_exec_init()))
-		goto driver_exit;
-
-	if ((rc = i2o_pci_init()))
-		goto exec_exit;
-
-	return 0;
-
-      exec_exit:
-	i2o_exec_exit();
-
-      driver_exit:
-	i2o_driver_exit();
-
-      exit:
-	return rc;
-}
-
-/**
- *	i2o_iop_exit - I2O main exit function
- *
- *	Removes I2O controllers from PCI subsystem and shut down OSMs.
- */
-static void __exit i2o_iop_exit(void)
-{
-	i2o_pci_exit();
-	i2o_exec_exit();
-	i2o_driver_exit();
-};
-
-module_init(i2o_iop_init);
-module_exit(i2o_iop_exit);
-
-MODULE_AUTHOR("Red Hat Software");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-#if BITS_PER_LONG == 64
-EXPORT_SYMBOL(i2o_cntxt_list_add);
-EXPORT_SYMBOL(i2o_cntxt_list_get);
-EXPORT_SYMBOL(i2o_cntxt_list_remove);
-EXPORT_SYMBOL(i2o_cntxt_list_get_ptr);
-#endif
-EXPORT_SYMBOL(i2o_msg_get_wait);
-EXPORT_SYMBOL(i2o_find_iop);
-EXPORT_SYMBOL(i2o_iop_find_device);
-EXPORT_SYMBOL(i2o_event_register);
-EXPORT_SYMBOL(i2o_status_get);
-EXPORT_SYMBOL(i2o_controllers);
diff --git a/drivers/message/i2o/memory.c b/drivers/message/i2o/memory.c
deleted file mode 100644
index 292b41e49fbd..000000000000
--- a/drivers/message/i2o/memory.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- *	Functions to handle I2O memory
- *
- *	Pulled from the inlines in i2o headers and uninlined
- *
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-#include <linux/delay.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include "core.h"
-
-/* Protects our 32/64bit mask switching */
-static DEFINE_MUTEX(mem_lock);
-
-/**
- *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
- *	@c: I2O controller for which the calculation should be done
- *	@body_size: maximum body size used for message in 32-bit words.
- *
- *	Return the maximum number of SG elements in a SG list.
- */
-u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	u16 sg_count =
-	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
-	    body_size;
-
-	if (c->pae_support) {
-		/*
-		 * for 64-bit a SG attribute element must be added and each
-		 * SG element needs 12 bytes instead of 8.
-		 */
-		sg_count -= 2;
-		sg_count /= 3;
-	} else
-		sg_count /= 2;
-
-	if (c->short_req && (sg_count > 8))
-		sg_count = 8;
-
-	return sg_count;
-}
-EXPORT_SYMBOL_GPL(i2o_sg_tablesize);
-
-
-/**
- *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
- *	@c: I2O controller
- *	@ptr: pointer to the data which should be mapped
- *	@size: size of data in bytes
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
- *	SG list if the allocation was successful.
- *
- *	Returns DMA address which must be checked for failures using
- *	dma_mapping_error().
- */
-dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
-					    size_t size,
-					    enum dma_data_direction direction,
-					    u32 ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 *mptr = *sg_ptr;
-	dma_addr_t dma_addr;
-
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0xd4000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0xd0000000;
-		break;
-	default:
-		return 0;
-	}
-
-	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
-	if (!dma_mapping_error(&c->pdev->dev, dma_addr)) {
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-			*mptr++ = cpu_to_le32(0x7C020002);
-			*mptr++ = cpu_to_le32(PAGE_SIZE);
-		}
-#endif
-
-		*mptr++ = cpu_to_le32(sg_flags | size);
-		*mptr++ = cpu_to_le32(i2o_dma_low(dma_addr));
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			*mptr++ = cpu_to_le32(i2o_dma_high(dma_addr));
-#endif
-		*sg_ptr = mptr;
-	}
-	return dma_addr;
-}
-EXPORT_SYMBOL_GPL(i2o_dma_map_single);
-
-/**
- *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
- *	@c: I2O controller
- *	@sg: SG list to be mapped
- *	@sg_count: number of elements in the SG list
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
- *	list if the allocation was successful.
- *
- *	Returns 0 on failure or 1 on success.
- */
-int i2o_dma_map_sg(struct i2o_controller *c, struct scatterlist *sg,
-	    int sg_count, enum dma_data_direction direction, u32 ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 *mptr = *sg_ptr;
-
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0x14000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0x10000000;
-		break;
-	default:
-		return 0;
-	}
-
-	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
-	if (!sg_count)
-		return 0;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-		*mptr++ = cpu_to_le32(0x7C020002);
-		*mptr++ = cpu_to_le32(PAGE_SIZE);
-	}
-#endif
-
-	while (sg_count-- > 0) {
-		if (!sg_count)
-			sg_flags |= 0xC0000000;
-		*mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
-		*mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			*mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
-#endif
-		sg = sg_next(sg);
-	}
-	*sg_ptr = mptr;
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(i2o_dma_map_sg);
-
-/**
- *	i2o_dma_alloc - Allocate DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should get the DMA buffer
- *	@len: length of the new DMA memory
- *
- *	Allocate a coherent DMA memory and write the pointers into addr.
- *
- *	Returns 0 on success or -ENOMEM on failure.
- */
-int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int dma_64 = 0;
-
-	mutex_lock(&mem_lock);
-	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_BIT_MASK(64))) {
-		dma_64 = 1;
-		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
-			mutex_unlock(&mem_lock);
-			return -ENOMEM;
-		}
-	}
-
-	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, GFP_KERNEL);
-
-	if ((sizeof(dma_addr_t) > 4) && dma_64)
-		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
-			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
-	mutex_unlock(&mem_lock);
-
-	if (!addr->virt)
-		return -ENOMEM;
-
-	memset(addr->virt, 0, len);
-	addr->len = len;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(i2o_dma_alloc);
-
-
-/**
- *	i2o_dma_free - Free DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which contains the DMA buffer
- *
- *	Free a coherent DMA memory and set virtual address of addr to NULL.
- */
-void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
-{
-	if (addr->virt) {
-		if (addr->phys)
-			dma_free_coherent(dev, addr->len, addr->virt,
-					  addr->phys);
-		else
-			kfree(addr->virt);
-		addr->virt = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(i2o_dma_free);
-
-
-/**
- *	i2o_dma_realloc - Realloc DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: pointer to a i2o_dma struct DMA buffer
- *	@len: new length of memory
- *
- *	If there was something allocated in the addr, free it first. If len > 0
- *	than try to allocate it and write the addresses back to the addr
- *	structure. If len == 0 set the virtual address to NULL.
- *
- *	Returns the 0 on success or negative error code on failure.
- */
-int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, size_t len)
-{
-	i2o_dma_free(dev, addr);
-
-	if (len)
-		return i2o_dma_alloc(dev, addr, len);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(i2o_dma_realloc);
-
-/*
- *	i2o_pool_alloc - Allocate an slab cache and mempool
- *	@mempool: pointer to struct i2o_pool to write data into.
- *	@name: name which is used to identify cache
- *	@size: size of each object
- *	@min_nr: minimum number of objects
- *
- *	First allocates a slab cache with name and size. Then allocates a
- *	mempool which uses the slab cache for allocation and freeing.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
-				 size_t size, int min_nr)
-{
-	pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
-	if (!pool->name)
-		goto exit;
-	strcpy(pool->name, name);
-
-	pool->slab =
-	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!pool->slab)
-		goto free_name;
-
-	pool->mempool = mempool_create_slab_pool(min_nr, pool->slab);
-	if (!pool->mempool)
-		goto free_slab;
-
-	return 0;
-
-free_slab:
-	kmem_cache_destroy(pool->slab);
-
-free_name:
-	kfree(pool->name);
-
-exit:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(i2o_pool_alloc);
-
-/*
- *	i2o_pool_free - Free slab cache and mempool again
- *	@mempool: pointer to struct i2o_pool which should be freed
- *
- *	Note that you have to return all objects to the mempool again before
- *	calling i2o_pool_free().
- */
-void i2o_pool_free(struct i2o_pool *pool)
-{
-	mempool_destroy(pool->mempool);
-	kmem_cache_destroy(pool->slab);
-	kfree(pool->name);
-};
-EXPORT_SYMBOL_GPL(i2o_pool_free);
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
deleted file mode 100644
index 0f9f3e1a2b6b..000000000000
--- a/drivers/message/i2o/pci.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- *	PCI handling of I2O controller
- *
- * 	Copyright (C) 1999-2002	Red Hat Software
- *
- *	Written by Alan Cox, Building Number Three Ltd
- *
- *	This program is free software; you can redistribute it and/or modify it
- *	under the terms of the GNU General Public License as published by the
- *	Free Software Foundation; either version 2 of the License, or (at your
- *	option) any later version.
- *
- *	A lot of the I2O message side code from this is taken from the Red
- *	Creek RCPCI45 adapter driver by Red Creek Communications
- *
- *	Fixes/additions:
- *		Philipp Rumpf
- *		Juha Sievänen <Juha.Sievanen@cs.Helsinki.FI>
- *		Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
- *		Deepak Saxena <deepak@plexity.net>
- *		Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
- *		Alan Cox <alan@lxorguk.ukuu.org.uk>:
- *			Ported to Linux 2.5.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Minor fixes for 2.6.
- *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
- *			Support for sysfs included.
- */
-
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/i2o.h>
-#include <linux/module.h>
-#include "core.h"
-
-#define OSM_DESCRIPTION	"I2O-subsystem"
-
-/* PCI device id table for all I2O controllers */
-static struct pci_device_id i2o_pci_ids[] = {
-	{PCI_DEVICE_CLASS(PCI_CLASS_INTELLIGENT_I2O << 8, 0xffff00)},
-	{PCI_DEVICE(PCI_VENDOR_ID_DPT, 0xa511)},
-	{.vendor = PCI_VENDOR_ID_INTEL,.device = 0x1962,
-	 .subvendor = PCI_VENDOR_ID_PROMISE,.subdevice = PCI_ANY_ID},
-	{0}
-};
-
-/**
- *	i2o_pci_free - Frees the DMA memory for the I2O controller
- *	@c: I2O controller to free
- *
- *	Remove all allocated DMA memory and unmap memory IO regions. If MTRR
- *	is enabled, also remove it again.
- */
-static void i2o_pci_free(struct i2o_controller *c)
-{
-	struct device *dev;
-
-	dev = &c->pdev->dev;
-
-	i2o_dma_free(dev, &c->out_queue);
-	i2o_dma_free(dev, &c->status_block);
-	kfree(c->lct);
-	i2o_dma_free(dev, &c->dlct);
-	i2o_dma_free(dev, &c->hrt);
-	i2o_dma_free(dev, &c->status);
-
-	if (c->raptor && c->in_queue.virt)
-		iounmap(c->in_queue.virt);
-
-	if (c->base.virt)
-		iounmap(c->base.virt);
-
-	pci_release_regions(c->pdev);
-}
-
-/**
- *	i2o_pci_alloc - Allocate DMA memory, map IO memory for I2O controller
- *	@c: I2O controller
- *
- *	Allocate DMA memory for a PCI (or in theory AGP) I2O controller. All
- *	IO mappings are also done here. If MTRR is enabled, also do add memory
- *	regions here.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_pci_alloc(struct i2o_controller *c)
-{
-	struct pci_dev *pdev = c->pdev;
-	struct device *dev = &pdev->dev;
-	int i;
-
-	if (pci_request_regions(pdev, OSM_DESCRIPTION)) {
-		printk(KERN_ERR "%s: device already claimed\n", c->name);
-		return -ENODEV;
-	}
-
-	for (i = 0; i < 6; i++) {
-		/* Skip I/O spaces */
-		if (!(pci_resource_flags(pdev, i) & IORESOURCE_IO)) {
-			if (!c->base.phys) {
-				c->base.phys = pci_resource_start(pdev, i);
-				c->base.len = pci_resource_len(pdev, i);
-
-				/*
-				 * If we know what card it is, set the size
-				 * correctly. Code is taken from dpt_i2o.c
-				 */
-				if (pdev->device == 0xa501) {
-					if (pdev->subsystem_device >= 0xc032 &&
-					    pdev->subsystem_device <= 0xc03b) {
-						if (c->base.len > 0x400000)
-							c->base.len = 0x400000;
-					} else {
-						if (c->base.len > 0x100000)
-							c->base.len = 0x100000;
-					}
-				}
-				if (!c->raptor)
-					break;
-			} else {
-				c->in_queue.phys = pci_resource_start(pdev, i);
-				c->in_queue.len = pci_resource_len(pdev, i);
-				break;
-			}
-		}
-	}
-
-	if (i == 6) {
-		printk(KERN_ERR "%s: I2O controller has no memory regions"
-		       " defined.\n", c->name);
-		i2o_pci_free(c);
-		return -EINVAL;
-	}
-
-	/* Map the I2O controller */
-	if (c->raptor) {
-		printk(KERN_INFO "%s: PCI I2O controller\n", c->name);
-		printk(KERN_INFO "     BAR0 at 0x%08lX size=%ld\n",
-		       (unsigned long)c->base.phys, (unsigned long)c->base.len);
-		printk(KERN_INFO "     BAR1 at 0x%08lX size=%ld\n",
-		       (unsigned long)c->in_queue.phys,
-		       (unsigned long)c->in_queue.len);
-	} else
-		printk(KERN_INFO "%s: PCI I2O controller at %08lX size=%ld\n",
-		       c->name, (unsigned long)c->base.phys,
-		       (unsigned long)c->base.len);
-
-	c->base.virt = ioremap_nocache(c->base.phys, c->base.len);
-	if (!c->base.virt) {
-		printk(KERN_ERR "%s: Unable to map controller.\n", c->name);
-		i2o_pci_free(c);
-		return -ENOMEM;
-	}
-
-	if (c->raptor) {
-		c->in_queue.virt =
-		    ioremap_nocache(c->in_queue.phys, c->in_queue.len);
-		if (!c->in_queue.virt) {
-			printk(KERN_ERR "%s: Unable to map controller.\n",
-			       c->name);
-			i2o_pci_free(c);
-			return -ENOMEM;
-		}
-	} else
-		c->in_queue = c->base;
-
-	c->irq_status = c->base.virt + I2O_IRQ_STATUS;
-	c->irq_mask = c->base.virt + I2O_IRQ_MASK;
-	c->in_port = c->base.virt + I2O_IN_PORT;
-	c->out_port = c->base.virt + I2O_OUT_PORT;
-
-	/* Motorola/Freescale chip does not follow spec */
-	if (pdev->vendor == PCI_VENDOR_ID_MOTOROLA && pdev->device == 0x18c0) {
-		/* Check if CPU is enabled */
-		if (be32_to_cpu(readl(c->base.virt + 0x10000)) & 0x10000000) {
-			printk(KERN_INFO "%s: MPC82XX needs CPU running to "
-			       "service I2O.\n", c->name);
-			i2o_pci_free(c);
-			return -ENODEV;
-		} else {
-			c->irq_status += I2O_MOTOROLA_PORT_OFFSET;
-			c->irq_mask += I2O_MOTOROLA_PORT_OFFSET;
-			c->in_port += I2O_MOTOROLA_PORT_OFFSET;
-			c->out_port += I2O_MOTOROLA_PORT_OFFSET;
-			printk(KERN_INFO "%s: MPC82XX workarounds activated.\n",
-			       c->name);
-		}
-	}
-
-	if (i2o_dma_alloc(dev, &c->status, 8)) {
-		i2o_pci_free(c);
-		return -ENOMEM;
-	}
-
-	if (i2o_dma_alloc(dev, &c->hrt, sizeof(i2o_hrt))) {
-		i2o_pci_free(c);
-		return -ENOMEM;
-	}
-
-	if (i2o_dma_alloc(dev, &c->dlct, 8192)) {
-		i2o_pci_free(c);
-		return -ENOMEM;
-	}
-
-	if (i2o_dma_alloc(dev, &c->status_block, sizeof(i2o_status_block))) {
-		i2o_pci_free(c);
-		return -ENOMEM;
-	}
-
-	if (i2o_dma_alloc(dev, &c->out_queue,
-		I2O_MAX_OUTBOUND_MSG_FRAMES * I2O_OUTBOUND_MSG_FRAME_SIZE *
-				sizeof(u32))) {
-		i2o_pci_free(c);
-		return -ENOMEM;
-	}
-
-	pci_set_drvdata(pdev, c);
-
-	return 0;
-}
-
-/**
- *	i2o_pci_interrupt - Interrupt handler for I2O controller
- *	@irq: interrupt line
- *	@dev_id: pointer to the I2O controller
- *
- *	Handle an interrupt from a PCI based I2O controller. This turns out
- *	to be rather simple. We keep the controller pointer in the cookie.
- */
-static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id)
-{
-	struct i2o_controller *c = dev_id;
-	u32 m;
-	irqreturn_t rc = IRQ_NONE;
-
-	while (readl(c->irq_status) & I2O_IRQ_OUTBOUND_POST) {
-		m = readl(c->out_port);
-		if (m == I2O_QUEUE_EMPTY) {
-			/*
-			 * Old 960 steppings had a bug in the I2O unit that
-			 * caused the queue to appear empty when it wasn't.
-			 */
-			m = readl(c->out_port);
-			if (unlikely(m == I2O_QUEUE_EMPTY))
-				break;
-		}
-
-		/* dispatch it */
-		if (i2o_driver_dispatch(c, m))
-			/* flush it if result != 0 */
-			i2o_flush_reply(c, m);
-
-		rc = IRQ_HANDLED;
-	}
-
-	return rc;
-}
-
-/**
- *	i2o_pci_irq_enable - Allocate interrupt for I2O controller
- *	@c: i2o_controller that the request is for
- *
- *	Allocate an interrupt for the I2O controller, and activate interrupts
- *	on the I2O controller.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_pci_irq_enable(struct i2o_controller *c)
-{
-	struct pci_dev *pdev = c->pdev;
-	int rc;
-
-	writel(0xffffffff, c->irq_mask);
-
-	if (pdev->irq) {
-		rc = request_irq(pdev->irq, i2o_pci_interrupt, IRQF_SHARED,
-				 c->name, c);
-		if (rc < 0) {
-			printk(KERN_ERR "%s: unable to allocate interrupt %d."
-			       "\n", c->name, pdev->irq);
-			return rc;
-		}
-	}
-
-	writel(0x00000000, c->irq_mask);
-
-	printk(KERN_INFO "%s: Installed at IRQ %d\n", c->name, pdev->irq);
-
-	return 0;
-}
-
-/**
- *	i2o_pci_irq_disable - Free interrupt for I2O controller
- *	@c: I2O controller
- *
- *	Disable interrupts in I2O controller and then free interrupt.
- */
-static void i2o_pci_irq_disable(struct i2o_controller *c)
-{
-	writel(0xffffffff, c->irq_mask);
-
-	if (c->pdev->irq > 0)
-		free_irq(c->pdev->irq, c);
-}
-
-/**
- *	i2o_pci_probe - Probe the PCI device for an I2O controller
- *	@pdev: PCI device to test
- *	@id: id which matched with the PCI device id table
- *
- *	Probe the PCI device for any device which is a memory of the
- *	Intelligent, I2O class or an Adaptec Zero Channel Controller. We
- *	attempt to set up each such device and register it with the core.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static int i2o_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-	struct i2o_controller *c;
-	int rc;
-	struct pci_dev *i960 = NULL;
-
-	printk(KERN_INFO "i2o: Checking for PCI I2O controllers...\n");
-
-	if ((pdev->class & 0xff) > 1) {
-		printk(KERN_WARNING "i2o: %s does not support I2O 1.5 "
-		       "(skipping).\n", pci_name(pdev));
-		return -ENODEV;
-	}
-
-	if ((rc = pci_enable_device(pdev))) {
-		printk(KERN_WARNING "i2o: couldn't enable device %s\n",
-		       pci_name(pdev));
-		return rc;
-	}
-
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
-		printk(KERN_WARNING "i2o: no suitable DMA found for %s\n",
-		       pci_name(pdev));
-		rc = -ENODEV;
-		goto disable;
-	}
-
-	pci_set_master(pdev);
-
-	c = i2o_iop_alloc();
-	if (IS_ERR(c)) {
-		printk(KERN_ERR "i2o: couldn't allocate memory for %s\n",
-		       pci_name(pdev));
-		rc = PTR_ERR(c);
-		goto disable;
-	} else
-		printk(KERN_INFO "%s: controller found (%s)\n", c->name,
-		       pci_name(pdev));
-
-	c->pdev = pdev;
-	c->device.parent = &pdev->dev;
-
-	/* Cards that fall apart if you hit them with large I/O loads... */
-	if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
-		c->short_req = 1;
-		printk(KERN_INFO "%s: Symbios FC920 workarounds activated.\n",
-		       c->name);
-	}
-
-	if (pdev->subsystem_vendor == PCI_VENDOR_ID_PROMISE) {
-		/*
-		 * Expose the ship behind i960 for initialization, or it will
-		 * failed
-		 */
-		i960 = pci_get_slot(c->pdev->bus,
-				  PCI_DEVFN(PCI_SLOT(c->pdev->devfn), 0));
-
-		if (i960) {
-			pci_write_config_word(i960, 0x42, 0);
-			pci_dev_put(i960);
-		}
-
-		c->promise = 1;
-		c->limit_sectors = 1;
-	}
-
-	if (pdev->subsystem_vendor == PCI_VENDOR_ID_DPT)
-		c->adaptec = 1;
-
-	/* Cards that go bananas if you quiesce them before you reset them. */
-	if (pdev->vendor == PCI_VENDOR_ID_DPT) {
-		c->no_quiesce = 1;
-		if (pdev->device == 0xa511)
-			c->raptor = 1;
-
-		if (pdev->subsystem_device == 0xc05a) {
-			c->limit_sectors = 1;
-			printk(KERN_INFO
-			       "%s: limit sectors per request to %d\n", c->name,
-			       I2O_MAX_SECTORS_LIMITED);
-		}
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if (sizeof(dma_addr_t) > 4) {
-			if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
-				printk(KERN_INFO "%s: 64-bit DMA unavailable\n",
-				       c->name);
-			else {
-				c->pae_support = 1;
-				printk(KERN_INFO "%s: using 64-bit DMA\n",
-				       c->name);
-			}
-		}
-#endif
-	}
-
-	if ((rc = i2o_pci_alloc(c))) {
-		printk(KERN_ERR "%s: DMA / IO allocation for I2O controller "
-		       "failed\n", c->name);
-		goto free_controller;
-	}
-
-	if (i2o_pci_irq_enable(c)) {
-		printk(KERN_ERR "%s: unable to enable interrupts for I2O "
-		       "controller\n", c->name);
-		goto free_pci;
-	}
-
-	if ((rc = i2o_iop_add(c)))
-		goto uninstall;
-
-	if (i960)
-		pci_write_config_word(i960, 0x42, 0x03ff);
-
-	return 0;
-
-      uninstall:
-	i2o_pci_irq_disable(c);
-
-      free_pci:
-	i2o_pci_free(c);
-
-      free_controller:
-	i2o_iop_free(c);
-
-      disable:
-	pci_disable_device(pdev);
-
-	return rc;
-}
-
-/**
- *	i2o_pci_remove - Removes a I2O controller from the system
- *	@pdev: I2O controller which should be removed
- *
- *	Reset the I2O controller, disable interrupts and remove all allocated
- *	resources.
- */
-static void i2o_pci_remove(struct pci_dev *pdev)
-{
-	struct i2o_controller *c;
-	c = pci_get_drvdata(pdev);
-
-	i2o_iop_remove(c);
-	i2o_pci_irq_disable(c);
-	i2o_pci_free(c);
-
-	pci_disable_device(pdev);
-
-	printk(KERN_INFO "%s: Controller removed.\n", c->name);
-
-	put_device(&c->device);
-};
-
-/* PCI driver for I2O controller */
-static struct pci_driver i2o_pci_driver = {
-	.name = "PCI_I2O",
-	.id_table = i2o_pci_ids,
-	.probe = i2o_pci_probe,
-	.remove = i2o_pci_remove,
-};
-
-/**
- *	i2o_pci_init - registers I2O PCI driver in PCI subsystem
- *
- *	Returns > 0 on success or negative error code on failure.
- */
-int __init i2o_pci_init(void)
-{
-	return pci_register_driver(&i2o_pci_driver);
-};
-
-/**
- *	i2o_pci_exit - unregisters I2O PCI driver from PCI subsystem
- */
-void __exit i2o_pci_exit(void)
-{
-	pci_unregister_driver(&i2o_pci_driver);
-};
-
-MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 071ac116818f..9e52bcd5356d 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -110,4 +110,6 @@ source "drivers/staging/clocking-wizard/Kconfig"
 
 source "drivers/staging/fbtft/Kconfig"
 
+source "drivers/staging/i2o/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index b65ca376d957..6e0ac524c84d 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_CRYPTO_SKEIN)	+= skein/
 obj-$(CONFIG_UNISYSSPAR)	+= unisys/
 obj-$(CONFIG_COMMON_CLK_XLNX_CLKWZRD)	+= clocking-wizard/
 obj-$(CONFIG_FB_TFT)		+= fbtft/
+obj-$(CONFIG_I2O)		+= i2o/
diff --git a/drivers/staging/i2o/Kconfig b/drivers/staging/i2o/Kconfig
new file mode 100644
index 000000000000..286c53f4b13d
--- /dev/null
+++ b/drivers/staging/i2o/Kconfig
@@ -0,0 +1,120 @@
+menuconfig I2O
+	tristate "I2O device support"
+	depends on PCI
+	---help---
+	  The Intelligent Input/Output (I2O) architecture allows hardware
+	  drivers to be split into two parts: an operating system specific
+	  module called the OSM and an hardware specific module called the
+	  HDM. The OSM can talk to a whole range of HDM's, and ideally the
+	  HDM's are not OS dependent. This allows for the same HDM driver to
+	  be used under different operating systems if the relevant OSM is in
+	  place. In order for this to work, you need to have an I2O interface
+	  adapter card in your computer. This card contains a special I/O
+	  processor (IOP), thus allowing high speeds since the CPU does not
+	  have to deal with I/O.
+
+	  If you say Y here, you will get a choice of interface adapter
+	  drivers and OSM's with the following questions.
+
+	  To compile this support as a module, choose M here: the
+	  modules will be called i2o_core.
+
+	  If unsure, say N.
+
+if I2O
+
+config I2O_LCT_NOTIFY_ON_CHANGES
+	bool "Enable LCT notification"
+	default y
+	---help---
+	  Only say N here if you have a I2O controller from SUN. The SUN
+	  firmware doesn't support LCT notification on changes. If this option
+	  is enabled on such a controller the driver will hang up in a endless
+	  loop. On all other controllers say Y.
+
+	  If unsure, say Y.
+
+config I2O_EXT_ADAPTEC
+	bool "Enable Adaptec extensions"
+	default y
+	---help---
+	  Say Y for support of raidutils for Adaptec I2O controllers. You also
+	  have to say Y to "I2O Configuration support", "I2O SCSI OSM" below
+	  and to "SCSI generic support" under "SCSI device configuration".
+
+config I2O_EXT_ADAPTEC_DMA64
+	bool "Enable 64-bit DMA"
+	depends on I2O_EXT_ADAPTEC && ( 64BIT || HIGHMEM64G )
+	default y
+	---help---
+	  Say Y for support of 64-bit DMA transfer mode on Adaptec I2O
+	  controllers.
+	  Note: You need at least firmware version 3709.
+
+config I2O_CONFIG
+	tristate "I2O Configuration support"
+	depends on VIRT_TO_BUS
+	---help---
+	  Say Y for support of the configuration interface for the I2O adapters.
+	  If you have a RAID controller from Adaptec and you want to use the
+	  raidutils to manage your RAID array, you have to say Y here.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_config.
+
+	  Note: If you want to use the new API you have to download the
+	  i2o_config patch from http://i2o.shadowconnect.com/
+
+config I2O_CONFIG_OLD_IOCTL
+	bool "Enable ioctls (OBSOLETE)"
+	depends on I2O_CONFIG
+	default y
+	---help---
+	  Enables old ioctls.
+
+config I2O_BUS
+	tristate "I2O Bus Adapter OSM"
+	---help---
+	  Include support for the I2O Bus Adapter OSM. The Bus Adapter OSM
+	  provides access to the busses on the I2O controller. The main purpose
+	  is to rescan the bus to find new devices.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_bus.
+
+config I2O_BLOCK
+	tristate "I2O Block OSM"
+	depends on BLOCK
+	---help---
+	  Include support for the I2O Block OSM. The Block OSM presents disk
+	  and other structured block devices to the operating system. If you
+	  are using an RAID controller, you could access the array only by
+	  the Block OSM driver. But it is possible to access the single disks
+	  by the SCSI OSM driver, for example to monitor the disks.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_block.
+
+config I2O_SCSI
+	tristate "I2O SCSI OSM"
+	depends on SCSI
+	---help---
+	  Allows direct SCSI access to SCSI devices on a SCSI or FibreChannel
+	  I2O controller. You can use both the SCSI and Block OSM together if
+	  you wish. To access a RAID array, you must use the Block OSM driver.
+	  But you could use the SCSI OSM driver to monitor the single disks.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_scsi.
+
+config I2O_PROC
+	tristate "I2O /proc support"
+	---help---
+	  If you say Y here and to "/proc file system support", you will be
+	  able to read I2O related information from the virtual directory
+	  /proc/i2o.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_proc.
+
+endif # I2O
diff --git a/drivers/staging/i2o/Makefile b/drivers/staging/i2o/Makefile
new file mode 100644
index 000000000000..b0982dacfd0a
--- /dev/null
+++ b/drivers/staging/i2o/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the kernel I2O OSM.
+#
+# Note : at this point, these files are compiled on all systems.
+# In the future, some of these should be built conditionally.
+#
+
+i2o_core-y		+= iop.o driver.o device.o debug.o pci.o exec-osm.o memory.o
+i2o_bus-y		+= bus-osm.o
+i2o_config-y		+= config-osm.o
+obj-$(CONFIG_I2O)	+= i2o_core.o
+obj-$(CONFIG_I2O_CONFIG)+= i2o_config.o
+obj-$(CONFIG_I2O_BUS)	+= i2o_bus.o
+obj-$(CONFIG_I2O_BLOCK)	+= i2o_block.o
+obj-$(CONFIG_I2O_SCSI)	+= i2o_scsi.o
+obj-$(CONFIG_I2O_PROC)	+= i2o_proc.o
diff --git a/drivers/staging/i2o/README b/drivers/staging/i2o/README
new file mode 100644
index 000000000000..f072a8eb3041
--- /dev/null
+++ b/drivers/staging/i2o/README
@@ -0,0 +1,98 @@
+
+	Linux I2O Support	(c) Copyright 1999 Red Hat Software
+					and others.
+
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version
+	2 of the License, or (at your option) any later version.
+
+AUTHORS (so far)
+
+Alan Cox, Building Number Three Ltd.
+	Core code, SCSI and Block OSMs
+
+Steve Ralston, LSI Logic Corp.
+	Debugging SCSI and Block OSM
+
+Deepak Saxena, Intel Corp.
+	Various core/block extensions
+	/proc interface, bug fixes
+	Ioctl interfaces for control
+	Debugging LAN OSM
+
+Philip Rumpf
+	Fixed assorted dumb SMP locking bugs
+
+Juha Sievanen, University of Helsinki Finland
+	LAN OSM code
+	/proc interface to LAN class
+	Bug fixes
+	Core code extensions
+
+Auvo Häkkinen, University of Helsinki Finland
+	LAN OSM code
+	/Proc interface to LAN class
+	Bug fixes
+	Core code extensions
+
+Taneli Vähäkangas, University of Helsinki Finland
+	Fixes to i2o_config
+
+CREDITS
+
+	This work was made possible by 
+
+Red Hat Software
+	Funding for the Building #3 part of the project
+
+Symbios Logic (Now LSI)
+	Host adapters, hints, known to work platforms when I hit
+	compatibility problems
+
+BoxHill Corporation
+	Loan of initial FibreChannel disk array used for development work.
+
+European Commission
+	Funding the work done by the University of Helsinki
+
+SysKonnect
+        Loan of FDDI and Gigabit Ethernet cards
+
+ASUSTeK
+        Loan of I2O motherboard 
+
+STATUS:
+
+o	The core setup works within limits.
+o	The scsi layer seems to almost work. 
+           I'm still chasing down the hang bug.
+o	The block OSM is mostly functional
+o	LAN OSM works with FDDI and Ethernet cards.
+
+TO DO:
+
+General:
+o	Provide hidden address space if asked
+o	Long term message flow control
+o	PCI IOP's without interrupts are not supported yet
+o	Push FAIL handling into the core
+o	DDM control interfaces for module load etc
+o       Add I2O 2.0 support (Deffered to 2.5 kernel)
+
+Block:
+o	Multiple major numbers
+o	Read ahead and cache handling stuff. Talk to Ingo and people
+o	Power management
+o	Finish Media changers
+
+SCSI:
+o	Find the right way to associate drives/luns/busses
+
+Lan:	
+o	Performance tuning
+o	Test Fibre Channel code
+
+Tape:
+o	Anyone seen anything implementing this ?
+           (D.S: Will attempt to do so if spare cycles permit)
diff --git a/drivers/staging/i2o/README.ioctl b/drivers/staging/i2o/README.ioctl
new file mode 100644
index 000000000000..4a7d2ebdfc97
--- /dev/null
+++ b/drivers/staging/i2o/README.ioctl
@@ -0,0 +1,394 @@
+
+Linux I2O User Space Interface
+rev 0.3 - 04/20/99
+
+=============================================================================
+Originally written by Deepak Saxena(deepak@plexity.net)
+Currently maintained by Deepak Saxena(deepak@plexity.net)
+=============================================================================
+
+I. Introduction
+
+The Linux I2O subsystem provides a set of ioctl() commands that can be
+utilized by user space applications to communicate with IOPs and devices
+on individual IOPs. This document defines the specific ioctl() commands
+that are available to the user and provides examples of their uses.
+
+This document assumes the reader is familiar with or has access to the 
+I2O specification as no I2O message parameters are outlined.  For information 
+on the specification, see http://www.i2osig.org
+
+This document and the I2O user space interface are currently maintained
+by Deepak Saxena.  Please send all comments, errata, and bug fixes to
+deepak@csociety.purdue.edu
+
+II. IOP Access
+
+Access to the I2O subsystem is provided through the device file named 
+/dev/i2o/ctl.  This file is a character file with major number 10 and minor
+number 166.  It can be created through the following command:
+
+   mknod /dev/i2o/ctl c 10 166
+
+III. Determining the IOP Count
+
+   SYNOPSIS 
+
+   ioctl(fd, I2OGETIOPS,  int *count);
+
+   u8 count[MAX_I2O_CONTROLLERS];
+
+   DESCRIPTION
+
+   This function returns the system's active IOP table.  count should
+   point to a buffer containing MAX_I2O_CONTROLLERS entries.  Upon 
+   returning, each entry will contain a non-zero value if the given
+   IOP unit is active, and NULL if it is inactive or non-existent.
+
+   RETURN VALUE.
+
+   Returns 0 if no errors occur, and -1 otherwise.  If an error occurs,
+   errno is set appropriately:
+
+     EFAULT   Invalid user space pointer was passed
+
+IV. Getting Hardware Resource Table
+
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OHRTGET, struct i2o_cmd_hrt *hrt);
+
+      struct i2o_cmd_hrtlct
+      {
+         u32   iop;      /* IOP unit number */
+         void  *resbuf;  /* Buffer for result */
+         u32   *reslen;  /* Buffer length in bytes */
+      };
+
+   DESCRIPTION
+
+   This function returns the Hardware Resource Table of the IOP specified 
+   by hrt->iop in the buffer pointed to by hrt->resbuf. The actual size of 
+   the data is written into *(hrt->reslen).
+
+   RETURNS
+
+   This function returns 0 if no errors occur. If an error occurs, -1 
+   is returned and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ENOBUFS     Buffer not large enough.  If this occurs, the required
+                  buffer length is written into *(hrt->reslen)
+  
+V. Getting Logical Configuration Table
+   
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OLCTGET, struct i2o_cmd_lct *lct);
+
+      struct i2o_cmd_hrtlct
+      {
+         u32   iop;      /* IOP unit number */
+         void  *resbuf;  /* Buffer for result */
+         u32   *reslen;  /* Buffer length in bytes */
+      };
+
+   DESCRIPTION
+
+   This function returns the Logical Configuration Table of the IOP specified
+   by lct->iop in the buffer pointed to by lct->resbuf. The actual size of 
+   the data is written into *(lct->reslen).
+
+   RETURNS
+
+   This function returns 0 if no errors occur. If an error occurs, -1 
+   is returned and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ENOBUFS     Buffer not large enough.  If this occurs, the required
+                  buffer length is written into *(lct->reslen)
+
+VI. Setting Parameters
+   
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OPARMSET, struct i2o_parm_setget *ops);
+
+      struct i2o_cmd_psetget
+      {
+         u32   iop;      /* IOP unit number */
+         u32   tid;      /* Target device TID */
+         void  *opbuf;   /* Operation List buffer */
+         u32   oplen;    /* Operation List buffer length in bytes */
+         void  *resbuf;  /* Result List buffer */
+         u32   *reslen;  /* Result List buffer length in bytes */
+      };
+
+   DESCRIPTION
+
+   This function posts a UtilParamsSet message to the device identified
+   by ops->iop and ops->tid.  The operation list for the message is 
+   sent through the ops->opbuf buffer, and the result list is written
+   into the buffer pointed to by ops->resbuf.  The number of bytes 
+   written is placed into *(ops->reslen). 
+
+   RETURNS
+
+   The return value is the size in bytes of the data written into
+   ops->resbuf if no errors occur.  If an error occurs, -1 is returned 
+   and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ENOBUFS     Buffer not large enough.  If this occurs, the required
+                  buffer length is written into *(ops->reslen)
+      ETIMEDOUT   Timeout waiting for reply message
+      ENOMEM      Kernel memory allocation error
+
+   A return value of 0 does not mean that the value was actually
+   changed properly on the IOP.  The user should check the result
+   list to determine the specific status of the transaction.
+
+VII. Getting Parameters
+   
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OPARMGET, struct i2o_parm_setget *ops);
+
+      struct i2o_parm_setget
+      {
+         u32   iop;      /* IOP unit number */
+         u32   tid;      /* Target device TID */
+         void  *opbuf;   /* Operation List buffer */
+         u32   oplen;    /* Operation List buffer length in bytes */
+         void  *resbuf;  /* Result List buffer */
+         u32   *reslen;  /* Result List buffer length in bytes */
+      };
+
+   DESCRIPTION
+
+   This function posts a UtilParamsGet message to the device identified
+   by ops->iop and ops->tid.  The operation list for the message is 
+   sent through the ops->opbuf buffer, and the result list is written
+   into the buffer pointed to by ops->resbuf.  The actual size of data
+   written is placed into *(ops->reslen).
+
+   RETURNS
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ENOBUFS     Buffer not large enough.  If this occurs, the required
+                  buffer length is written into *(ops->reslen)
+      ETIMEDOUT   Timeout waiting for reply message
+      ENOMEM      Kernel memory allocation error
+
+   A return value of 0 does not mean that the value was actually
+   properly retrieved.  The user should check the result list 
+   to determine the specific status of the transaction.
+
+VIII. Downloading Software
+   
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OSWDL, struct i2o_sw_xfer *sw);
+
+      struct i2o_sw_xfer
+      {
+         u32   iop;       /* IOP unit number */
+         u8    flags;     /* DownloadFlags field */
+         u8    sw_type;   /* Software type */
+         u32   sw_id;     /* Software ID */
+         void  *buf;      /* Pointer to software buffer */
+         u32   *swlen;    /* Length of software buffer */        
+         u32   *maxfrag;  /* Number of fragments */
+         u32   *curfrag;  /* Current fragment number */
+      };
+
+   DESCRIPTION
+
+   This function downloads a software fragment pointed by sw->buf
+   to the iop identified by sw->iop. The DownloadFlags, SwID, SwType
+   and SwSize fields of the ExecSwDownload message are filled in with
+   the values of sw->flags, sw->sw_id, sw->sw_type and *(sw->swlen).
+
+   The fragments _must_ be sent in order and be 8K in size. The last
+   fragment _may_ be shorter, however. The kernel will compute its
+   size based on information in the sw->swlen field.
+
+   Please note that SW transfers can take a long time.
+
+   RETURNS
+
+   This function returns 0 no errors occur. If an error occurs, -1 
+   is returned and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ETIMEDOUT   Timeout waiting for reply message
+      ENOMEM      Kernel memory allocation error
+
+IX. Uploading Software
+   
+   SYNOPSIS 
+
+   ioctl(fd, I2OSWUL, struct i2o_sw_xfer *sw);
+
+      struct i2o_sw_xfer
+      {
+         u32   iop;      /* IOP unit number */
+         u8    flags; 	 /* UploadFlags */
+         u8    sw_type;  /* Software type */
+         u32   sw_id;    /* Software ID */
+         void  *buf;     /* Pointer to software buffer */
+         u32   *swlen;   /* Length of software buffer */        
+         u32   *maxfrag; /* Number of fragments */
+         u32   *curfrag; /* Current fragment number */
+      };
+
+   DESCRIPTION
+
+   This function uploads a software fragment from the IOP identified
+   by sw->iop, sw->sw_type, sw->sw_id and optionally sw->swlen fields.
+   The UploadFlags, SwID, SwType and SwSize fields of the ExecSwUpload
+   message are filled in with the values of sw->flags, sw->sw_id,
+   sw->sw_type and *(sw->swlen).
+
+   The fragments _must_ be requested in order and be 8K in size. The
+   user is responsible for allocating memory pointed by sw->buf. The
+   last fragment _may_ be shorter.
+
+   Please note that SW transfers can take a long time.
+
+   RETURNS
+
+   This function returns 0 if no errors occur.  If an error occurs, -1
+   is returned and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ETIMEDOUT   Timeout waiting for reply message
+      ENOMEM      Kernel memory allocation error
+         
+X. Removing Software
+   
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OSWDEL, struct i2o_sw_xfer *sw);
+
+      struct i2o_sw_xfer
+      {
+         u32   iop;      /* IOP unit number */
+         u8    flags; 	 /* RemoveFlags */
+         u8    sw_type;  /* Software type */
+         u32   sw_id;    /* Software ID */
+         void  *buf;     /* Unused */
+         u32   *swlen;   /* Length of the software data */        
+         u32   *maxfrag; /* Unused */
+         u32   *curfrag; /* Unused */
+      };
+
+   DESCRIPTION
+
+   This function removes software from the IOP identified by sw->iop.
+   The RemoveFlags, SwID, SwType and SwSize fields of the ExecSwRemove message 
+   are filled in with the values of sw->flags, sw->sw_id, sw->sw_type and 
+   *(sw->swlen). Give zero in *(sw->len) if the value is unknown. IOP uses 
+   *(sw->swlen) value to verify correct identication of the module to remove. 
+   The actual size of the module is written into *(sw->swlen).
+
+   RETURNS
+
+   This function returns 0 if no errors occur.  If an error occurs, -1
+   is returned and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ETIMEDOUT   Timeout waiting for reply message
+      ENOMEM      Kernel memory allocation error
+
+X. Validating Configuration
+
+   SYNOPSIS
+
+   ioctl(fd, I2OVALIDATE, int *iop);
+	u32 iop;
+
+   DESCRIPTION
+
+   This function posts an ExecConfigValidate message to the controller
+   identified by iop. This message indicates that the current
+   configuration is accepted. The iop changes the status of suspect drivers 
+   to valid and may delete old drivers from its store.
+
+   RETURNS
+
+   This function returns 0 if no erro occur.  If an error occurs, -1 is
+   returned and errno is set appropriately:
+
+      ETIMEDOUT   Timeout waiting for reply message
+      ENXIO       Invalid IOP number
+
+XI. Configuration Dialog
+   
+   SYNOPSIS 
+ 
+   ioctl(fd, I2OHTML, struct i2o_html *htquery);
+      struct i2o_html
+      {
+         u32   iop;      /* IOP unit number */
+         u32   tid;      /* Target device ID */
+         u32   page;     /* HTML page */
+         void  *resbuf;  /* Buffer for reply HTML page */
+         u32   *reslen;  /* Length in bytes of reply buffer */
+         void  *qbuf;    /* Pointer to HTTP query string */
+         u32   qlen;     /* Length in bytes of query string buffer */        
+      };
+
+   DESCRIPTION
+
+   This function posts an UtilConfigDialog message to the device identified
+   by htquery->iop and htquery->tid.  The requested HTML page number is 
+   provided by the htquery->page field, and the resultant data is stored 
+   in the buffer pointed to by htquery->resbuf.  If there is an HTTP query 
+   string that is to be sent to the device, it should be sent in the buffer
+   pointed to by htquery->qbuf.  If there is no query string, this field
+   should be set to NULL. The actual size of the reply received is written
+   into *(htquery->reslen).
+  
+   RETURNS
+
+   This function returns 0 if no error occur. If an error occurs, -1
+   is returned and errno is set appropriately:
+
+      EFAULT      Invalid user space pointer was passed
+      ENXIO       Invalid IOP number
+      ENOBUFS     Buffer not large enough.  If this occurs, the required
+                  buffer length is written into *(ops->reslen)
+      ETIMEDOUT   Timeout waiting for reply message
+      ENOMEM      Kernel memory allocation error
+
+XII. Events
+
+    In the process of determining this.  Current idea is to have use
+    the select() interface to allow user apps to periodically poll
+    the /dev/i2o/ctl device for events.  When select() notifies the user
+    that an event is available, the user would call read() to retrieve
+    a list of all the events that are pending for the specific device.
+
+=============================================================================
+Revision History
+=============================================================================
+
+Rev 0.1 - 04/01/99
+- Initial revision
+
+Rev 0.2 - 04/06/99
+- Changed return values to match UNIX ioctl() standard.  Only return values
+  are 0 and -1.  All errors are reported through errno.
+- Added summary of proposed possible event interfaces
+
+Rev 0.3 - 04/20/99
+- Changed all ioctls() to use pointers to user data instead of actual data
+- Updated error values to match the code
diff --git a/drivers/staging/i2o/bus-osm.c b/drivers/staging/i2o/bus-osm.c
new file mode 100644
index 000000000000..7aa0339aea05
--- /dev/null
+++ b/drivers/staging/i2o/bus-osm.c
@@ -0,0 +1,176 @@
+/*
+ *	Bus Adapter OSM
+ *
+ *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+
+#define OSM_NAME	"bus-osm"
+#define OSM_VERSION	"1.317"
+#define OSM_DESCRIPTION	"I2O Bus Adapter OSM"
+
+static struct i2o_driver i2o_bus_driver;
+
+/* Bus OSM class handling definition */
+static struct i2o_class_id i2o_bus_class_id[] = {
+	{I2O_CLASS_BUS_ADAPTER},
+	{I2O_CLASS_END}
+};
+
+/**
+ *	i2o_bus_scan - Scan the bus for new devices
+ *	@dev: I2O device of the bus, which should be scanned
+ *
+ *	Scans the bus dev for new / removed devices. After the scan a new LCT
+ *	will be fetched automatically.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_bus_scan(struct i2o_device *dev)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return -ETIMEDOUT;
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.
+			tid);
+
+	return i2o_msg_post_wait(dev->iop, msg, 60);
+};
+
+/**
+ *	i2o_bus_store_scan - Scan the I2O Bus Adapter
+ *	@d: device which should be scanned
+ *	@attr: device_attribute
+ *	@buf: output buffer
+ *	@count: buffer size
+ *
+ *	Returns count.
+ */
+static ssize_t i2o_bus_store_scan(struct device *d,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(d);
+	int rc;
+
+	if ((rc = i2o_bus_scan(i2o_dev)))
+		osm_warn("bus scan failed %d\n", rc);
+
+	return count;
+}
+
+/* Bus Adapter OSM device attributes */
+static DEVICE_ATTR(scan, S_IWUSR, NULL, i2o_bus_store_scan);
+
+/**
+ *	i2o_bus_probe - verify if dev is a I2O Bus Adapter device and install it
+ *	@dev: device to verify if it is a I2O Bus Adapter device
+ *
+ *	Because we want all Bus Adapters always return 0.
+ *	Except when we fail.  Then we are sad.
+ *
+ *	Returns 0, except when we fail to excel.
+ */
+static int i2o_bus_probe(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(get_device(dev));
+	int rc;
+
+	rc = device_create_file(dev, &dev_attr_scan);
+	if (rc)
+		goto err_out;
+
+	osm_info("device added (TID: %03x)\n", i2o_dev->lct_data.tid);
+
+	return 0;
+
+err_out:
+	put_device(dev);
+	return rc;
+};
+
+/**
+ *	i2o_bus_remove - remove the I2O Bus Adapter device from the system again
+ *	@dev: I2O Bus Adapter device which should be removed
+ *
+ *	Always returns 0.
+ */
+static int i2o_bus_remove(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+
+	device_remove_file(dev, &dev_attr_scan);
+
+	put_device(dev);
+
+	osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid);
+
+	return 0;
+};
+
+/* Bus Adapter OSM driver struct */
+static struct i2o_driver i2o_bus_driver = {
+	.name = OSM_NAME,
+	.classes = i2o_bus_class_id,
+	.driver = {
+		   .probe = i2o_bus_probe,
+		   .remove = i2o_bus_remove,
+		   },
+};
+
+/**
+ *	i2o_bus_init - Bus Adapter OSM initialization function
+ *
+ *	Only register the Bus Adapter OSM in the I2O core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_bus_init(void)
+{
+	int rc;
+
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	/* Register Bus Adapter OSM into I2O core */
+	rc = i2o_driver_register(&i2o_bus_driver);
+	if (rc) {
+		osm_err("Could not register Bus Adapter OSM\n");
+		return rc;
+	}
+
+	return 0;
+};
+
+/**
+ *	i2o_bus_exit - Bus Adapter OSM exit function
+ *
+ *	Unregisters Bus Adapter OSM from I2O core.
+ */
+static void __exit i2o_bus_exit(void)
+{
+	i2o_driver_unregister(&i2o_bus_driver);
+};
+
+MODULE_AUTHOR("Markus Lidel <Markus.Lidel@shadowconnect.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_bus_init);
+module_exit(i2o_bus_exit);
diff --git a/drivers/staging/i2o/config-osm.c b/drivers/staging/i2o/config-osm.c
new file mode 100644
index 000000000000..519f52f9f688
--- /dev/null
+++ b/drivers/staging/i2o/config-osm.c
@@ -0,0 +1,90 @@
+/*
+ *	Configuration OSM
+ *
+ *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+
+#define OSM_NAME	"config-osm"
+#define OSM_VERSION	"1.323"
+#define OSM_DESCRIPTION	"I2O Configuration OSM"
+
+/* access mode user rw */
+#define S_IWRSR (S_IRUSR | S_IWUSR)
+
+static struct i2o_driver i2o_config_driver;
+
+/* Config OSM driver struct */
+static struct i2o_driver i2o_config_driver = {
+	.name = OSM_NAME,
+};
+
+#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
+#include "i2o_config.c"
+#endif
+
+/**
+ *	i2o_config_init - Configuration OSM initialization function
+ *
+ *	Registers Configuration OSM in the I2O core and if old ioctl's are
+ *	compiled in initialize them.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_config_init(void)
+{
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	if (i2o_driver_register(&i2o_config_driver)) {
+		osm_err("handler register failed.\n");
+		return -EBUSY;
+	}
+#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
+	if (i2o_config_old_init()) {
+		osm_err("old config handler initialization failed\n");
+		i2o_driver_unregister(&i2o_config_driver);
+		return -EBUSY;
+	}
+#endif
+
+	return 0;
+}
+
+/**
+ *	i2o_config_exit - Configuration OSM exit function
+ *
+ *	If old ioctl's are compiled in exit remove them and unregisters
+ *	Configuration OSM from I2O core.
+ */
+static void i2o_config_exit(void)
+{
+#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
+	i2o_config_old_exit();
+#endif
+
+	i2o_driver_unregister(&i2o_config_driver);
+}
+
+MODULE_AUTHOR("Markus Lidel <Markus.Lidel@shadowconnect.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_config_init);
+module_exit(i2o_config_exit);
diff --git a/drivers/staging/i2o/core.h b/drivers/staging/i2o/core.h
new file mode 100644
index 000000000000..91614f11f89a
--- /dev/null
+++ b/drivers/staging/i2o/core.h
@@ -0,0 +1,69 @@
+/*
+ *	I2O core internal declarations
+ *
+ *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+/* Exec-OSM */
+extern struct i2o_driver i2o_exec_driver;
+extern int i2o_exec_lct_get(struct i2o_controller *);
+
+extern int __init i2o_exec_init(void);
+extern void i2o_exec_exit(void);
+
+/* driver */
+extern struct bus_type i2o_bus_type;
+
+extern int i2o_driver_dispatch(struct i2o_controller *, u32);
+
+extern int __init i2o_driver_init(void);
+extern void i2o_driver_exit(void);
+
+/* PCI */
+extern int __init i2o_pci_init(void);
+extern void __exit i2o_pci_exit(void);
+
+/* device */
+extern const struct attribute_group *i2o_device_groups[];
+
+extern void i2o_device_remove(struct i2o_device *);
+extern int i2o_device_parse_lct(struct i2o_controller *);
+
+int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
+		   int oplen, void *reslist, int reslen);
+
+/* IOP */
+extern struct i2o_controller *i2o_iop_alloc(void);
+
+/**
+ *	i2o_iop_free - Free the i2o_controller struct
+ *	@c: I2O controller to free
+ */
+static inline void i2o_iop_free(struct i2o_controller *c)
+{
+	i2o_pool_free(&c->in_msg);
+	kfree(c);
+}
+
+extern int i2o_iop_add(struct i2o_controller *);
+extern void i2o_iop_remove(struct i2o_controller *);
+
+/* control registers relative to c->base */
+#define I2O_IRQ_STATUS	0x30
+#define I2O_IRQ_MASK	0x34
+#define I2O_IN_PORT	0x40
+#define I2O_OUT_PORT	0x44
+
+/* Motorola/Freescale specific register offset */
+#define I2O_MOTOROLA_PORT_OFFSET	0x10400
+
+#define I2O_IRQ_OUTBOUND_POST	0x00000008
diff --git a/drivers/staging/i2o/debug.c b/drivers/staging/i2o/debug.c
new file mode 100644
index 000000000000..7a16114ed8ea
--- /dev/null
+++ b/drivers/staging/i2o/debug.c
@@ -0,0 +1,472 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "i2o.h"
+
+static void i2o_report_util_cmd(u8 cmd);
+static void i2o_report_exec_cmd(u8 cmd);
+static void i2o_report_fail_status(u8 req_status, u32 * msg);
+static void i2o_report_common_status(u8 req_status);
+static void i2o_report_common_dsc(u16 detailed_status);
+
+/*
+ * Used for error reporting/debugging purposes.
+ * Report Cmd name, Request status, Detailed Status.
+ */
+void i2o_report_status(const char *severity, const char *str,
+		       struct i2o_message *m)
+{
+	u32 *msg = (u32 *) m;
+	u8 cmd = (msg[1] >> 24) & 0xFF;
+	u8 req_status = (msg[4] >> 24) & 0xFF;
+	u16 detailed_status = msg[4] & 0xFFFF;
+
+	if (cmd == I2O_CMD_UTIL_EVT_REGISTER)
+		return;		// No status in this reply
+
+	printk("%s%s: ", severity, str);
+
+	if (cmd < 0x1F)		// Utility cmd
+		i2o_report_util_cmd(cmd);
+
+	else if (cmd >= 0xA0 && cmd <= 0xEF)	// Executive cmd
+		i2o_report_exec_cmd(cmd);
+	else
+		printk("Cmd = %0#2x, ", cmd);	// Other cmds
+
+	if (msg[0] & MSG_FAIL) {
+		i2o_report_fail_status(req_status, msg);
+		return;
+	}
+
+	i2o_report_common_status(req_status);
+
+	if (cmd < 0x1F || (cmd >= 0xA0 && cmd <= 0xEF))
+		i2o_report_common_dsc(detailed_status);
+	else
+		printk(" / DetailedStatus = %0#4x.\n",
+		       detailed_status);
+}
+
+/* Used to dump a message to syslog during debugging */
+void i2o_dump_message(struct i2o_message *m)
+{
+#ifdef DEBUG
+	u32 *msg = (u32 *) m;
+	int i;
+	printk(KERN_INFO "Dumping I2O message size %d @ %p\n",
+	       msg[0] >> 16 & 0xffff, msg);
+	for (i = 0; i < ((msg[0] >> 16) & 0xffff); i++)
+		printk(KERN_INFO "  msg[%d] = %0#10x\n", i, msg[i]);
+#endif
+}
+
+/*
+ * Used for error reporting/debugging purposes.
+ * Following fail status are common to all classes.
+ * The preserved message must be handled in the reply handler.
+ */
+static void i2o_report_fail_status(u8 req_status, u32 * msg)
+{
+	static char *FAIL_STATUS[] = {
+		"0x80",		/* not used */
+		"SERVICE_SUSPENDED",	/* 0x81 */
+		"SERVICE_TERMINATED",	/* 0x82 */
+		"CONGESTION",
+		"FAILURE",
+		"STATE_ERROR",
+		"TIME_OUT",
+		"ROUTING_FAILURE",
+		"INVALID_VERSION",
+		"INVALID_OFFSET",
+		"INVALID_MSG_FLAGS",
+		"FRAME_TOO_SMALL",
+		"FRAME_TOO_LARGE",
+		"INVALID_TARGET_ID",
+		"INVALID_INITIATOR_ID",
+		"INVALID_INITIATOR_CONTEX",	/* 0x8F */
+		"UNKNOWN_FAILURE"	/* 0xFF */
+	};
+
+	if (req_status == I2O_FSC_TRANSPORT_UNKNOWN_FAILURE)
+		printk("TRANSPORT_UNKNOWN_FAILURE (%0#2x).\n",
+		       req_status);
+	else
+		printk("TRANSPORT_%s.\n",
+		       FAIL_STATUS[req_status & 0x0F]);
+
+	/* Dump some details */
+
+	printk(KERN_ERR "  InitiatorId = %d, TargetId = %d\n",
+	       (msg[1] >> 12) & 0xFFF, msg[1] & 0xFFF);
+	printk(KERN_ERR "  LowestVersion = 0x%02X, HighestVersion = 0x%02X\n",
+	       (msg[4] >> 8) & 0xFF, msg[4] & 0xFF);
+	printk(KERN_ERR "  FailingHostUnit = 0x%04X,  FailingIOP = 0x%03X\n",
+	       msg[5] >> 16, msg[5] & 0xFFF);
+
+	printk(KERN_ERR "  Severity:  0x%02X\n", (msg[4] >> 16) & 0xFF);
+	if (msg[4] & (1 << 16))
+		printk(KERN_DEBUG "(FormatError), "
+		       "this msg can never be delivered/processed.\n");
+	if (msg[4] & (1 << 17))
+		printk(KERN_DEBUG "(PathError), "
+		       "this msg can no longer be delivered/processed.\n");
+	if (msg[4] & (1 << 18))
+		printk(KERN_DEBUG "(PathState), "
+		       "the system state does not allow delivery.\n");
+	if (msg[4] & (1 << 19))
+		printk(KERN_DEBUG
+		       "(Congestion), resources temporarily not available;"
+		       "do not retry immediately.\n");
+}
+
+/*
+ * Used for error reporting/debugging purposes.
+ * Following reply status are common to all classes.
+ */
+static void i2o_report_common_status(u8 req_status)
+{
+	static char *REPLY_STATUS[] = {
+		"SUCCESS",
+		"ABORT_DIRTY",
+		"ABORT_NO_DATA_TRANSFER",
+		"ABORT_PARTIAL_TRANSFER",
+		"ERROR_DIRTY",
+		"ERROR_NO_DATA_TRANSFER",
+		"ERROR_PARTIAL_TRANSFER",
+		"PROCESS_ABORT_DIRTY",
+		"PROCESS_ABORT_NO_DATA_TRANSFER",
+		"PROCESS_ABORT_PARTIAL_TRANSFER",
+		"TRANSACTION_ERROR",
+		"PROGRESS_REPORT"
+	};
+
+	if (req_status >= ARRAY_SIZE(REPLY_STATUS))
+		printk("RequestStatus = %0#2x", req_status);
+	else
+		printk("%s", REPLY_STATUS[req_status]);
+}
+
+/*
+ * Used for error reporting/debugging purposes.
+ * Following detailed status are valid  for executive class,
+ * utility class, DDM class and for transaction error replies.
+ */
+static void i2o_report_common_dsc(u16 detailed_status)
+{
+	static char *COMMON_DSC[] = {
+		"SUCCESS",
+		"0x01",		// not used
+		"BAD_KEY",
+		"TCL_ERROR",
+		"REPLY_BUFFER_FULL",
+		"NO_SUCH_PAGE",
+		"INSUFFICIENT_RESOURCE_SOFT",
+		"INSUFFICIENT_RESOURCE_HARD",
+		"0x08",		// not used
+		"CHAIN_BUFFER_TOO_LARGE",
+		"UNSUPPORTED_FUNCTION",
+		"DEVICE_LOCKED",
+		"DEVICE_RESET",
+		"INAPPROPRIATE_FUNCTION",
+		"INVALID_INITIATOR_ADDRESS",
+		"INVALID_MESSAGE_FLAGS",
+		"INVALID_OFFSET",
+		"INVALID_PARAMETER",
+		"INVALID_REQUEST",
+		"INVALID_TARGET_ADDRESS",
+		"MESSAGE_TOO_LARGE",
+		"MESSAGE_TOO_SMALL",
+		"MISSING_PARAMETER",
+		"TIMEOUT",
+		"UNKNOWN_ERROR",
+		"UNKNOWN_FUNCTION",
+		"UNSUPPORTED_VERSION",
+		"DEVICE_BUSY",
+		"DEVICE_NOT_AVAILABLE"
+	};
+
+	if (detailed_status > I2O_DSC_DEVICE_NOT_AVAILABLE)
+		printk(" / DetailedStatus = %0#4x.\n",
+		       detailed_status);
+	else
+		printk(" / %s.\n", COMMON_DSC[detailed_status]);
+}
+
+/*
+ * Used for error reporting/debugging purposes
+ */
+static void i2o_report_util_cmd(u8 cmd)
+{
+	switch (cmd) {
+	case I2O_CMD_UTIL_NOP:
+		printk("UTIL_NOP, ");
+		break;
+	case I2O_CMD_UTIL_ABORT:
+		printk("UTIL_ABORT, ");
+		break;
+	case I2O_CMD_UTIL_CLAIM:
+		printk("UTIL_CLAIM, ");
+		break;
+	case I2O_CMD_UTIL_RELEASE:
+		printk("UTIL_CLAIM_RELEASE, ");
+		break;
+	case I2O_CMD_UTIL_CONFIG_DIALOG:
+		printk("UTIL_CONFIG_DIALOG, ");
+		break;
+	case I2O_CMD_UTIL_DEVICE_RESERVE:
+		printk("UTIL_DEVICE_RESERVE, ");
+		break;
+	case I2O_CMD_UTIL_DEVICE_RELEASE:
+		printk("UTIL_DEVICE_RELEASE, ");
+		break;
+	case I2O_CMD_UTIL_EVT_ACK:
+		printk("UTIL_EVENT_ACKNOWLEDGE, ");
+		break;
+	case I2O_CMD_UTIL_EVT_REGISTER:
+		printk("UTIL_EVENT_REGISTER, ");
+		break;
+	case I2O_CMD_UTIL_LOCK:
+		printk("UTIL_LOCK, ");
+		break;
+	case I2O_CMD_UTIL_LOCK_RELEASE:
+		printk("UTIL_LOCK_RELEASE, ");
+		break;
+	case I2O_CMD_UTIL_PARAMS_GET:
+		printk("UTIL_PARAMS_GET, ");
+		break;
+	case I2O_CMD_UTIL_PARAMS_SET:
+		printk("UTIL_PARAMS_SET, ");
+		break;
+	case I2O_CMD_UTIL_REPLY_FAULT_NOTIFY:
+		printk("UTIL_REPLY_FAULT_NOTIFY, ");
+		break;
+	default:
+		printk("Cmd = %0#2x, ", cmd);
+	}
+}
+
+/*
+ * Used for error reporting/debugging purposes
+ */
+static void i2o_report_exec_cmd(u8 cmd)
+{
+	switch (cmd) {
+	case I2O_CMD_ADAPTER_ASSIGN:
+		printk("EXEC_ADAPTER_ASSIGN, ");
+		break;
+	case I2O_CMD_ADAPTER_READ:
+		printk("EXEC_ADAPTER_READ, ");
+		break;
+	case I2O_CMD_ADAPTER_RELEASE:
+		printk("EXEC_ADAPTER_RELEASE, ");
+		break;
+	case I2O_CMD_BIOS_INFO_SET:
+		printk("EXEC_BIOS_INFO_SET, ");
+		break;
+	case I2O_CMD_BOOT_DEVICE_SET:
+		printk("EXEC_BOOT_DEVICE_SET, ");
+		break;
+	case I2O_CMD_CONFIG_VALIDATE:
+		printk("EXEC_CONFIG_VALIDATE, ");
+		break;
+	case I2O_CMD_CONN_SETUP:
+		printk("EXEC_CONN_SETUP, ");
+		break;
+	case I2O_CMD_DDM_DESTROY:
+		printk("EXEC_DDM_DESTROY, ");
+		break;
+	case I2O_CMD_DDM_ENABLE:
+		printk("EXEC_DDM_ENABLE, ");
+		break;
+	case I2O_CMD_DDM_QUIESCE:
+		printk("EXEC_DDM_QUIESCE, ");
+		break;
+	case I2O_CMD_DDM_RESET:
+		printk("EXEC_DDM_RESET, ");
+		break;
+	case I2O_CMD_DDM_SUSPEND:
+		printk("EXEC_DDM_SUSPEND, ");
+		break;
+	case I2O_CMD_DEVICE_ASSIGN:
+		printk("EXEC_DEVICE_ASSIGN, ");
+		break;
+	case I2O_CMD_DEVICE_RELEASE:
+		printk("EXEC_DEVICE_RELEASE, ");
+		break;
+	case I2O_CMD_HRT_GET:
+		printk("EXEC_HRT_GET, ");
+		break;
+	case I2O_CMD_ADAPTER_CLEAR:
+		printk("EXEC_IOP_CLEAR, ");
+		break;
+	case I2O_CMD_ADAPTER_CONNECT:
+		printk("EXEC_IOP_CONNECT, ");
+		break;
+	case I2O_CMD_ADAPTER_RESET:
+		printk("EXEC_IOP_RESET, ");
+		break;
+	case I2O_CMD_LCT_NOTIFY:
+		printk("EXEC_LCT_NOTIFY, ");
+		break;
+	case I2O_CMD_OUTBOUND_INIT:
+		printk("EXEC_OUTBOUND_INIT, ");
+		break;
+	case I2O_CMD_PATH_ENABLE:
+		printk("EXEC_PATH_ENABLE, ");
+		break;
+	case I2O_CMD_PATH_QUIESCE:
+		printk("EXEC_PATH_QUIESCE, ");
+		break;
+	case I2O_CMD_PATH_RESET:
+		printk("EXEC_PATH_RESET, ");
+		break;
+	case I2O_CMD_STATIC_MF_CREATE:
+		printk("EXEC_STATIC_MF_CREATE, ");
+		break;
+	case I2O_CMD_STATIC_MF_RELEASE:
+		printk("EXEC_STATIC_MF_RELEASE, ");
+		break;
+	case I2O_CMD_STATUS_GET:
+		printk("EXEC_STATUS_GET, ");
+		break;
+	case I2O_CMD_SW_DOWNLOAD:
+		printk("EXEC_SW_DOWNLOAD, ");
+		break;
+	case I2O_CMD_SW_UPLOAD:
+		printk("EXEC_SW_UPLOAD, ");
+		break;
+	case I2O_CMD_SW_REMOVE:
+		printk("EXEC_SW_REMOVE, ");
+		break;
+	case I2O_CMD_SYS_ENABLE:
+		printk("EXEC_SYS_ENABLE, ");
+		break;
+	case I2O_CMD_SYS_MODIFY:
+		printk("EXEC_SYS_MODIFY, ");
+		break;
+	case I2O_CMD_SYS_QUIESCE:
+		printk("EXEC_SYS_QUIESCE, ");
+		break;
+	case I2O_CMD_SYS_TAB_SET:
+		printk("EXEC_SYS_TAB_SET, ");
+		break;
+	default:
+		printk("Cmd = %#02x, ", cmd);
+	}
+}
+
+void i2o_debug_state(struct i2o_controller *c)
+{
+	printk(KERN_INFO "%s: State = ", c->name);
+	switch (((i2o_status_block *) c->status_block.virt)->iop_state) {
+	case 0x01:
+		printk("INIT\n");
+		break;
+	case 0x02:
+		printk("RESET\n");
+		break;
+	case 0x04:
+		printk("HOLD\n");
+		break;
+	case 0x05:
+		printk("READY\n");
+		break;
+	case 0x08:
+		printk("OPERATIONAL\n");
+		break;
+	case 0x10:
+		printk("FAILED\n");
+		break;
+	case 0x11:
+		printk("FAULTED\n");
+		break;
+	default:
+		printk("%x (unknown !!)\n",
+		       ((i2o_status_block *) c->status_block.virt)->iop_state);
+	}
+};
+
+void i2o_dump_hrt(struct i2o_controller *c)
+{
+	u32 *rows = (u32 *) c->hrt.virt;
+	u8 *p = (u8 *) c->hrt.virt;
+	u8 *d;
+	int count;
+	int length;
+	int i;
+	int state;
+
+	if (p[3] != 0) {
+		printk(KERN_ERR
+		       "%s: HRT table for controller is too new a version.\n",
+		       c->name);
+		return;
+	}
+
+	count = p[0] | (p[1] << 8);
+	length = p[2];
+
+	printk(KERN_INFO "%s: HRT has %d entries of %d bytes each.\n",
+	       c->name, count, length << 2);
+
+	rows += 2;
+
+	for (i = 0; i < count; i++) {
+		printk(KERN_INFO "Adapter %08X: ", rows[0]);
+		p = (u8 *) (rows + 1);
+		d = (u8 *) (rows + 2);
+		state = p[1] << 8 | p[0];
+
+		printk("TID %04X:[", state & 0xFFF);
+		state >>= 12;
+		if (state & (1 << 0))
+			printk("H");	/* Hidden */
+		if (state & (1 << 2)) {
+			printk("P");	/* Present */
+			if (state & (1 << 1))
+				printk("C");	/* Controlled */
+		}
+		if (state > 9)
+			printk("*");	/* Hard */
+
+		printk("]:");
+
+		switch (p[3] & 0xFFFF) {
+		case 0:
+			/* Adapter private bus - easy */
+			printk("Local bus %d: I/O at 0x%04X Mem 0x%08X", p[2],
+			       d[1] << 8 | d[0], *(u32 *) (d + 4));
+			break;
+		case 1:
+			/* ISA bus */
+			printk("ISA %d: CSN %d I/O at 0x%04X Mem 0x%08X", p[2],
+			       d[2], d[1] << 8 | d[0], *(u32 *) (d + 4));
+			break;
+
+		case 2:	/* EISA bus */
+			printk("EISA %d: Slot %d I/O at 0x%04X Mem 0x%08X",
+			       p[2], d[3], d[1] << 8 | d[0], *(u32 *) (d + 4));
+			break;
+
+		case 3:	/* MCA bus */
+			printk("MCA %d: Slot %d I/O at 0x%04X Mem 0x%08X", p[2],
+			       d[3], d[1] << 8 | d[0], *(u32 *) (d + 4));
+			break;
+
+		case 4:	/* PCI bus */
+			printk("PCI %d: Bus %d Device %d Function %d", p[2],
+			       d[2], d[1], d[0]);
+			break;
+
+		case 0x80:	/* Other */
+		default:
+			printk("Unsupported bus type.");
+			break;
+		}
+		printk("\n");
+		rows += length;
+	}
+}
+
+EXPORT_SYMBOL(i2o_dump_message);
diff --git a/drivers/staging/i2o/device.c b/drivers/staging/i2o/device.c
new file mode 100644
index 000000000000..2af22553dd4e
--- /dev/null
+++ b/drivers/staging/i2o/device.c
@@ -0,0 +1,594 @@
+/*
+ *	Functions to handle I2O devices
+ *
+ *	Copyright (C) 2004	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "core.h"
+
+/**
+ *	i2o_device_issue_claim - claim or release a device
+ *	@dev: I2O device to claim or release
+ *	@cmd: claim or release command
+ *	@type: type of claim
+ *
+ *	Issue I2O UTIL_CLAIM or UTIL_RELEASE messages. The message to be sent
+ *	is set by cmd. dev is the I2O device which should be claim or
+ *	released and the type is the claim type (see the I2O spec).
+ *
+ *	Returs 0 on success or negative error code on failure.
+ */
+static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
+					 u32 type)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
+	msg->body[0] = cpu_to_le32(type);
+
+	return i2o_msg_post_wait(dev->iop, msg, 60);
+}
+
+/**
+ *	i2o_device_claim - claim a device for use by an OSM
+ *	@dev: I2O device to claim
+ *
+ *	Do the leg work to assign a device to a given OSM. If the claim succeeds,
+ *	the owner is the primary. If the attempt fails a negative errno code
+ *	is returned. On success zero is returned.
+ */
+int i2o_device_claim(struct i2o_device *dev)
+{
+	int rc = 0;
+
+	mutex_lock(&dev->lock);
+
+	rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_CLAIM, I2O_CLAIM_PRIMARY);
+	if (!rc)
+		pr_debug("i2o: claim of device %d succeeded\n",
+			 dev->lct_data.tid);
+	else
+		pr_debug("i2o: claim of device %d failed %d\n",
+			 dev->lct_data.tid, rc);
+
+	mutex_unlock(&dev->lock);
+
+	return rc;
+}
+
+/**
+ *	i2o_device_claim_release - release a device that the OSM is using
+ *	@dev: device to release
+ *
+ *	Drop a claim by an OSM on a given I2O device.
+ *
+ *	AC - some devices seem to want to refuse an unclaim until they have
+ *	finished internal processing. It makes sense since you don't want a
+ *	new device to go reconfiguring the entire system until you are done.
+ *	Thus we are prepared to wait briefly.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_device_claim_release(struct i2o_device *dev)
+{
+	int tries;
+	int rc = 0;
+
+	mutex_lock(&dev->lock);
+
+	/*
+	 *      If the controller takes a nonblocking approach to
+	 *      releases we have to sleep/poll for a few times.
+	 */
+	for (tries = 0; tries < 10; tries++) {
+		rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_RELEASE,
+					    I2O_CLAIM_PRIMARY);
+		if (!rc)
+			break;
+
+		ssleep(1);
+	}
+
+	if (!rc)
+		pr_debug("i2o: claim release of device %d succeeded\n",
+			 dev->lct_data.tid);
+	else
+		pr_debug("i2o: claim release of device %d failed %d\n",
+			 dev->lct_data.tid, rc);
+
+	mutex_unlock(&dev->lock);
+
+	return rc;
+}
+
+/**
+ *	i2o_device_release - release the memory for a I2O device
+ *	@dev: I2O device which should be released
+ *
+ *	Release the allocated memory. This function is called if refcount of
+ *	device reaches 0 automatically.
+ */
+static void i2o_device_release(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+
+	pr_debug("i2o: device %s released\n", dev_name(dev));
+
+	kfree(i2o_dev);
+}
+
+/**
+ *	class_id_show - Displays class id of I2O device
+ *	@dev: device of which the class id should be displayed
+ *	@attr: pointer to device attribute
+ *	@buf: buffer into which the class id should be printed
+ *
+ *	Returns the number of bytes which are printed into the buffer.
+ */
+static ssize_t class_id_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+
+	sprintf(buf, "0x%03x\n", i2o_dev->lct_data.class_id);
+	return strlen(buf) + 1;
+}
+static DEVICE_ATTR_RO(class_id);
+
+/**
+ *	tid_show - Displays TID of I2O device
+ *	@dev: device of which the TID should be displayed
+ *	@attr: pointer to device attribute
+ *	@buf: buffer into which the TID should be printed
+ *
+ *	Returns the number of bytes which are printed into the buffer.
+ */
+static ssize_t tid_show(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+
+	sprintf(buf, "0x%03x\n", i2o_dev->lct_data.tid);
+	return strlen(buf) + 1;
+}
+static DEVICE_ATTR_RO(tid);
+
+/* I2O device attributes */
+static struct attribute *i2o_device_attrs[] = {
+	&dev_attr_class_id.attr,
+	&dev_attr_tid.attr,
+	NULL,
+};
+
+static const struct attribute_group i2o_device_group = {
+	.attrs = i2o_device_attrs,
+};
+
+const struct attribute_group *i2o_device_groups[] = {
+	&i2o_device_group,
+	NULL,
+};
+
+/**
+ *	i2o_device_alloc - Allocate a I2O device and initialize it
+ *
+ *	Allocate the memory for a I2O device and initialize locks and lists
+ *
+ *	Returns the allocated I2O device or a negative error code if the device
+ *	could not be allocated.
+ */
+static struct i2o_device *i2o_device_alloc(void)
+{
+	struct i2o_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&dev->list);
+	mutex_init(&dev->lock);
+
+	dev->device.bus = &i2o_bus_type;
+	dev->device.release = &i2o_device_release;
+
+	return dev;
+}
+
+/**
+ *	i2o_device_add - allocate a new I2O device and add it to the IOP
+ *	@c: I2O controller that the device is on
+ *	@entry: LCT entry of the I2O device
+ *
+ *	Allocate a new I2O device and initialize it with the LCT entry. The
+ *	device is appended to the device list of the controller.
+ *
+ *	Returns zero on success, or a -ve errno.
+ */
+static int i2o_device_add(struct i2o_controller *c, i2o_lct_entry *entry)
+{
+	struct i2o_device *i2o_dev, *tmp;
+	int rc;
+
+	i2o_dev = i2o_device_alloc();
+	if (IS_ERR(i2o_dev)) {
+		printk(KERN_ERR "i2o: unable to allocate i2o device\n");
+		return PTR_ERR(i2o_dev);
+	}
+
+	i2o_dev->lct_data = *entry;
+
+	dev_set_name(&i2o_dev->device, "%d:%03x", c->unit,
+		     i2o_dev->lct_data.tid);
+
+	i2o_dev->iop = c;
+	i2o_dev->device.parent = &c->device;
+
+	rc = device_register(&i2o_dev->device);
+	if (rc)
+		goto err;
+
+	list_add_tail(&i2o_dev->list, &c->devices);
+
+	/* create user entries for this device */
+	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
+	if (tmp && (tmp != i2o_dev)) {
+		rc = sysfs_create_link(&i2o_dev->device.kobj,
+				       &tmp->device.kobj, "user");
+		if (rc)
+			goto unreg_dev;
+	}
+
+	/* create user entries referring to this device */
+	list_for_each_entry(tmp, &c->devices, list)
+	    if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+		&& (tmp != i2o_dev)) {
+		rc = sysfs_create_link(&tmp->device.kobj,
+				       &i2o_dev->device.kobj, "user");
+		if (rc)
+			goto rmlink1;
+	}
+
+	/* create parent entries for this device */
+	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
+	if (tmp && (tmp != i2o_dev)) {
+		rc = sysfs_create_link(&i2o_dev->device.kobj,
+				       &tmp->device.kobj, "parent");
+		if (rc)
+			goto rmlink1;
+	}
+
+	/* create parent entries referring to this device */
+	list_for_each_entry(tmp, &c->devices, list)
+	    if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+		&& (tmp != i2o_dev)) {
+		rc = sysfs_create_link(&tmp->device.kobj,
+				       &i2o_dev->device.kobj, "parent");
+		if (rc)
+			goto rmlink2;
+	}
+
+	i2o_driver_notify_device_add_all(i2o_dev);
+
+	pr_debug("i2o: device %s added\n", dev_name(&i2o_dev->device));
+
+	return 0;
+
+rmlink2:
+	/* If link creating failed halfway, we loop whole list to cleanup.
+	 * And we don't care wrong removing of link, because sysfs_remove_link
+	 * will take care of it.
+	 */
+	list_for_each_entry(tmp, &c->devices, list) {
+		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "parent");
+	}
+	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
+rmlink1:
+	list_for_each_entry(tmp, &c->devices, list)
+		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "user");
+	sysfs_remove_link(&i2o_dev->device.kobj, "user");
+unreg_dev:
+	list_del(&i2o_dev->list);
+	device_unregister(&i2o_dev->device);
+err:
+	kfree(i2o_dev);
+	return rc;
+}
+
+/**
+ *	i2o_device_remove - remove an I2O device from the I2O core
+ *	@i2o_dev: I2O device which should be released
+ *
+ *	Is used on I2O controller removal or LCT modification, when the device
+ *	is removed from the system. Note that the device could still hang
+ *	around until the refcount reaches 0.
+ */
+void i2o_device_remove(struct i2o_device *i2o_dev)
+{
+	struct i2o_device *tmp;
+	struct i2o_controller *c = i2o_dev->iop;
+
+	i2o_driver_notify_device_remove_all(i2o_dev);
+
+	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
+	sysfs_remove_link(&i2o_dev->device.kobj, "user");
+
+	list_for_each_entry(tmp, &c->devices, list) {
+		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "parent");
+		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "user");
+	}
+	list_del(&i2o_dev->list);
+
+	device_unregister(&i2o_dev->device);
+}
+
+/**
+ *	i2o_device_parse_lct - Parse a previously fetched LCT and create devices
+ *	@c: I2O controller from which the LCT should be parsed.
+ *
+ *	The Logical Configuration Table tells us what we can talk to on the
+ *	board. For every entry we create an I2O device, which is registered in
+ *	the I2O core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_device_parse_lct(struct i2o_controller *c)
+{
+	struct i2o_device *dev, *tmp;
+	i2o_lct *lct;
+	u32 *dlct = c->dlct.virt;
+	int max = 0, i = 0;
+	u16 table_size;
+	u32 buf;
+
+	mutex_lock(&c->lct_lock);
+
+	kfree(c->lct);
+
+	buf = le32_to_cpu(*dlct++);
+	table_size = buf & 0xffff;
+
+	lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL);
+	if (!lct) {
+		mutex_unlock(&c->lct_lock);
+		return -ENOMEM;
+	}
+
+	lct->lct_ver = buf >> 28;
+	lct->boot_tid = buf >> 16 & 0xfff;
+	lct->table_size = table_size;
+	lct->change_ind = le32_to_cpu(*dlct++);
+	lct->iop_flags = le32_to_cpu(*dlct++);
+
+	table_size -= 3;
+
+	pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, max,
+		 lct->table_size);
+
+	while (table_size > 0) {
+		i2o_lct_entry *entry = &lct->lct_entry[max];
+		int found = 0;
+
+		buf = le32_to_cpu(*dlct++);
+		entry->entry_size = buf & 0xffff;
+		entry->tid = buf >> 16 & 0xfff;
+
+		entry->change_ind = le32_to_cpu(*dlct++);
+		entry->device_flags = le32_to_cpu(*dlct++);
+
+		buf = le32_to_cpu(*dlct++);
+		entry->class_id = buf & 0xfff;
+		entry->version = buf >> 12 & 0xf;
+		entry->vendor_id = buf >> 16;
+
+		entry->sub_class = le32_to_cpu(*dlct++);
+
+		buf = le32_to_cpu(*dlct++);
+		entry->user_tid = buf & 0xfff;
+		entry->parent_tid = buf >> 12 & 0xfff;
+		entry->bios_info = buf >> 24;
+
+		memcpy(&entry->identity_tag, dlct, 8);
+		dlct += 2;
+
+		entry->event_capabilities = le32_to_cpu(*dlct++);
+
+		/* add new devices, which are new in the LCT */
+		list_for_each_entry_safe(dev, tmp, &c->devices, list) {
+			if (entry->tid == dev->lct_data.tid) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found)
+			i2o_device_add(c, entry);
+
+		table_size -= 9;
+		max++;
+	}
+
+	/* remove devices, which are not in the LCT anymore */
+	list_for_each_entry_safe(dev, tmp, &c->devices, list) {
+		int found = 0;
+
+		for (i = 0; i < max; i++) {
+			if (lct->lct_entry[i].tid == dev->lct_data.tid) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found)
+			i2o_device_remove(dev);
+	}
+
+	mutex_unlock(&c->lct_lock);
+
+	return 0;
+}
+
+/*
+ *	Run time support routines
+ */
+
+/*	Issue UTIL_PARAMS_GET or UTIL_PARAMS_SET
+ *
+ *	This function can be used for all UtilParamsGet/Set operations.
+ *	The OperationList is given in oplist-buffer,
+ *	and results are returned in reslist-buffer.
+ *	Note that the minimum sized reslist is 8 bytes and contains
+ *	ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
+ */
+int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
+		   int oplen, void *reslist, int reslen)
+{
+	struct i2o_message *msg;
+	int i = 0;
+	int rc;
+	struct i2o_dma res;
+	struct i2o_controller *c = i2o_dev->iop;
+	struct device *dev = &c->pdev->dev;
+
+	res.virt = NULL;
+
+	if (i2o_dma_alloc(dev, &res, reslen))
+		return -ENOMEM;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg)) {
+		i2o_dma_free(dev, &res);
+		return PTR_ERR(msg);
+	}
+
+	i = 0;
+	msg->u.head[1] =
+	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid);
+	msg->body[i++] = cpu_to_le32(0x00000000);
+	msg->body[i++] = cpu_to_le32(0x4C000000 | oplen);	/* OperationList */
+	memcpy(&msg->body[i], oplist, oplen);
+	i += (oplen / 4 + (oplen % 4 ? 1 : 0));
+	msg->body[i++] = cpu_to_le32(0xD0000000 | res.len);	/* ResultList */
+	msg->body[i++] = cpu_to_le32(res.phys);
+
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
+			SGL_OFFSET_5);
+
+	rc = i2o_msg_post_wait_mem(c, msg, 10, &res);
+
+	/* This only looks like a memory leak - don't "fix" it. */
+	if (rc == -ETIMEDOUT)
+		return rc;
+
+	memcpy(reslist, res.virt, res.len);
+	i2o_dma_free(dev, &res);
+
+	return rc;
+}
+
+/*
+ *	 Query one field group value or a whole scalar group.
+ */
+int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
+		       void *buf, int buflen)
+{
+	u32 opblk[] = { cpu_to_le32(0x00000001),
+		cpu_to_le32((u16) group << 16 | I2O_PARAMS_FIELD_GET),
+		cpu_to_le32((s16) field << 16 | 0x00000001)
+	};
+	u8 *resblk;		/* 8 bytes for header */
+	int rc;
+
+	resblk = kmalloc(buflen + 8, GFP_KERNEL);
+	if (!resblk)
+		return -ENOMEM;
+
+	rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
+			    sizeof(opblk), resblk, buflen + 8);
+
+	memcpy(buf, resblk + 8, buflen);	/* cut off header */
+
+	kfree(resblk);
+
+	return rc;
+}
+
+/*
+ *	if oper == I2O_PARAMS_TABLE_GET, get from all rows
+ *		if fieldcount == -1 return all fields
+ *			ibuf and ibuflen are unused (use NULL, 0)
+ *		else return specific fields
+ *			ibuf contains fieldindexes
+ *
+ *	if oper == I2O_PARAMS_LIST_GET, get from specific rows
+ *		if fieldcount == -1 return all fields
+ *			ibuf contains rowcount, keyvalues
+ *		else return specific fields
+ *			fieldcount is # of fieldindexes
+ *			ibuf contains fieldindexes, rowcount, keyvalues
+ *
+ *	You could also use directly function i2o_issue_params().
+ */
+int i2o_parm_table_get(struct i2o_device *dev, int oper, int group,
+		       int fieldcount, void *ibuf, int ibuflen, void *resblk,
+		       int reslen)
+{
+	u16 *opblk;
+	int size;
+
+	size = 10 + ibuflen;
+	if (size % 4)
+		size += 4 - size % 4;
+
+	opblk = kmalloc(size, GFP_KERNEL);
+	if (opblk == NULL) {
+		printk(KERN_ERR "i2o: no memory for query buffer.\n");
+		return -ENOMEM;
+	}
+
+	opblk[0] = 1;		/* operation count */
+	opblk[1] = 0;		/* pad */
+	opblk[2] = oper;
+	opblk[3] = group;
+	opblk[4] = fieldcount;
+	memcpy(opblk + 5, ibuf, ibuflen);	/* other params */
+
+	size = i2o_parm_issue(dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
+			      size, resblk, reslen);
+
+	kfree(opblk);
+	if (size > reslen)
+		return reslen;
+
+	return size;
+}
+
+EXPORT_SYMBOL(i2o_device_claim);
+EXPORT_SYMBOL(i2o_device_claim_release);
+EXPORT_SYMBOL(i2o_parm_field_get);
+EXPORT_SYMBOL(i2o_parm_table_get);
+EXPORT_SYMBOL(i2o_parm_issue);
diff --git a/drivers/staging/i2o/driver.c b/drivers/staging/i2o/driver.c
new file mode 100644
index 000000000000..111c3edde035
--- /dev/null
+++ b/drivers/staging/i2o/driver.c
@@ -0,0 +1,382 @@
+/*
+ *	Functions to handle I2O drivers (OSMs) and I2O bus type for sysfs
+ *
+ *	Copyright (C) 2004	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/rwsem.h>
+#include "i2o.h"
+#include <linux/workqueue.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "core.h"
+
+#define OSM_NAME	"i2o"
+
+/* max_drivers - Maximum I2O drivers (OSMs) which could be registered */
+static unsigned int i2o_max_drivers = I2O_MAX_DRIVERS;
+module_param_named(max_drivers, i2o_max_drivers, uint, 0);
+MODULE_PARM_DESC(max_drivers, "maximum number of OSM's to support");
+
+/* I2O drivers lock and array */
+static spinlock_t i2o_drivers_lock;
+static struct i2o_driver **i2o_drivers;
+
+/**
+ *	i2o_bus_match - Tell if I2O device class id matches the class ids of the I2O driver (OSM)
+ *	@dev: device which should be verified
+ *	@drv: the driver to match against
+ *
+ *	Used by the bus to check if the driver wants to handle the device.
+ *
+ *	Returns 1 if the class ids of the driver match the class id of the
+ *	device, otherwise 0.
+ */
+static int i2o_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	struct i2o_driver *i2o_drv = to_i2o_driver(drv);
+	struct i2o_class_id *ids = i2o_drv->classes;
+
+	if (ids)
+		while (ids->class_id != I2O_CLASS_END) {
+			if (ids->class_id == i2o_dev->lct_data.class_id)
+				return 1;
+			ids++;
+		}
+	return 0;
+};
+
+/* I2O bus type */
+struct bus_type i2o_bus_type = {
+	.name = "i2o",
+	.match = i2o_bus_match,
+	.dev_groups = i2o_device_groups,
+};
+
+/**
+ *	i2o_driver_register - Register a I2O driver (OSM) in the I2O core
+ *	@drv: I2O driver which should be registered
+ *
+ *	Registers the OSM drv in the I2O core and creates an event queues if
+ *	necessary.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_driver_register(struct i2o_driver *drv)
+{
+	struct i2o_controller *c;
+	int i;
+	int rc = 0;
+	unsigned long flags;
+
+	osm_debug("Register driver %s\n", drv->name);
+
+	if (drv->event) {
+		drv->event_queue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 1,
+						   drv->name);
+		if (!drv->event_queue) {
+			osm_err("Could not initialize event queue for driver "
+				"%s\n", drv->name);
+			return -EFAULT;
+		}
+		osm_debug("Event queue initialized for driver %s\n", drv->name);
+	} else
+		drv->event_queue = NULL;
+
+	drv->driver.name = drv->name;
+	drv->driver.bus = &i2o_bus_type;
+
+	spin_lock_irqsave(&i2o_drivers_lock, flags);
+
+	for (i = 0; i2o_drivers[i]; i++)
+		if (i >= i2o_max_drivers) {
+			osm_err("too many drivers registered, increase "
+				"max_drivers\n");
+			spin_unlock_irqrestore(&i2o_drivers_lock, flags);
+			rc = -EFAULT;
+			goto out;
+		}
+
+	drv->context = i;
+	i2o_drivers[i] = drv;
+
+	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
+
+	osm_debug("driver %s gets context id %d\n", drv->name, drv->context);
+
+	list_for_each_entry(c, &i2o_controllers, list) {
+		struct i2o_device *i2o_dev;
+
+		i2o_driver_notify_controller_add(drv, c);
+		list_for_each_entry(i2o_dev, &c->devices, list)
+		    i2o_driver_notify_device_add(drv, i2o_dev);
+	}
+
+	rc = driver_register(&drv->driver);
+	if (rc)
+		goto out;
+
+	return 0;
+out:
+	if (drv->event_queue) {
+		destroy_workqueue(drv->event_queue);
+		drv->event_queue = NULL;
+	}
+
+	return rc;
+};
+
+/**
+ *	i2o_driver_unregister - Unregister a I2O driver (OSM) from the I2O core
+ *	@drv: I2O driver which should be unregistered
+ *
+ *	Unregisters the OSM drv from the I2O core and cleanup event queues if
+ *	necessary.
+ */
+void i2o_driver_unregister(struct i2o_driver *drv)
+{
+	struct i2o_controller *c;
+	unsigned long flags;
+
+	osm_debug("unregister driver %s\n", drv->name);
+
+	driver_unregister(&drv->driver);
+
+	list_for_each_entry(c, &i2o_controllers, list) {
+		struct i2o_device *i2o_dev;
+
+		list_for_each_entry(i2o_dev, &c->devices, list)
+		    i2o_driver_notify_device_remove(drv, i2o_dev);
+
+		i2o_driver_notify_controller_remove(drv, c);
+	}
+
+	spin_lock_irqsave(&i2o_drivers_lock, flags);
+	i2o_drivers[drv->context] = NULL;
+	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
+
+	if (drv->event_queue) {
+		destroy_workqueue(drv->event_queue);
+		drv->event_queue = NULL;
+		osm_debug("event queue removed for %s\n", drv->name);
+	}
+};
+
+/**
+ *	i2o_driver_dispatch - dispatch an I2O reply message
+ *	@c: I2O controller of the message
+ *	@m: I2O message number
+ *
+ *	The reply is delivered to the driver from which the original message
+ *	was. This function is only called from interrupt context.
+ *
+ *	Returns 0 on success and the message should not be flushed. Returns > 0
+ *	on success and if the message should be flushed afterwords. Returns
+ *	negative error code on failure (the message will be flushed too).
+ */
+int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
+{
+	struct i2o_driver *drv;
+	struct i2o_message *msg = i2o_msg_out_to_virt(c, m);
+	u32 context = le32_to_cpu(msg->u.s.icntxt);
+	unsigned long flags;
+
+	if (unlikely(context >= i2o_max_drivers)) {
+		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
+			 context);
+		return -EIO;
+	}
+
+	spin_lock_irqsave(&i2o_drivers_lock, flags);
+	drv = i2o_drivers[context];
+	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
+
+	if (unlikely(!drv)) {
+		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
+			 context);
+		return -EIO;
+	}
+
+	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) {
+		struct i2o_device *dev, *tmp;
+		struct i2o_event *evt;
+		u16 size;
+		u16 tid = le32_to_cpu(msg->u.head[1]) & 0xfff;
+
+		osm_debug("event received from device %d\n", tid);
+
+		if (!drv->event)
+			return -EIO;
+
+		/* cut of header from message size (in 32-bit words) */
+		size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
+
+		evt = kzalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
+		if (!evt)
+			return -ENOMEM;
+
+		evt->size = size;
+		evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
+		evt->event_indicator = le32_to_cpu(msg->body[0]);
+		memcpy(&evt->data, &msg->body[1], size * 4);
+
+		list_for_each_entry_safe(dev, tmp, &c->devices, list)
+		    if (dev->lct_data.tid == tid) {
+			evt->i2o_dev = dev;
+			break;
+		}
+
+		INIT_WORK(&evt->work, drv->event);
+		queue_work(drv->event_queue, &evt->work);
+		return 1;
+	}
+
+	if (unlikely(!drv->reply)) {
+		osm_debug("%s: Reply to driver %s, but no reply function"
+			  " defined!\n", c->name, drv->name);
+		return -EIO;
+	}
+
+	return drv->reply(c, m, msg);
+}
+
+/**
+ *	i2o_driver_notify_controller_add_all - Send notify of added controller
+ *	@c: newly added controller
+ *
+ *	Send notifications to all registered drivers that a new controller was
+ *	added.
+ */
+void i2o_driver_notify_controller_add_all(struct i2o_controller *c)
+{
+	int i;
+	struct i2o_driver *drv;
+
+	for (i = 0; i < i2o_max_drivers; i++) {
+		drv = i2o_drivers[i];
+
+		if (drv)
+			i2o_driver_notify_controller_add(drv, c);
+	}
+}
+
+/**
+ *	i2o_driver_notify_controller_remove_all - Send notify of removed controller
+ *	@c: controller that is being removed
+ *
+ *	Send notifications to all registered drivers that a controller was
+ *	removed.
+ */
+void i2o_driver_notify_controller_remove_all(struct i2o_controller *c)
+{
+	int i;
+	struct i2o_driver *drv;
+
+	for (i = 0; i < i2o_max_drivers; i++) {
+		drv = i2o_drivers[i];
+
+		if (drv)
+			i2o_driver_notify_controller_remove(drv, c);
+	}
+}
+
+/**
+ *	i2o_driver_notify_device_add_all - Send notify of added device
+ *	@i2o_dev: newly added I2O device
+ *
+ *	Send notifications to all registered drivers that a device was added.
+ */
+void i2o_driver_notify_device_add_all(struct i2o_device *i2o_dev)
+{
+	int i;
+	struct i2o_driver *drv;
+
+	for (i = 0; i < i2o_max_drivers; i++) {
+		drv = i2o_drivers[i];
+
+		if (drv)
+			i2o_driver_notify_device_add(drv, i2o_dev);
+	}
+}
+
+/**
+ *	i2o_driver_notify_device_remove_all - Send notify of removed device
+ *	@i2o_dev: device that is being removed
+ *
+ *	Send notifications to all registered drivers that a device was removed.
+ */
+void i2o_driver_notify_device_remove_all(struct i2o_device *i2o_dev)
+{
+	int i;
+	struct i2o_driver *drv;
+
+	for (i = 0; i < i2o_max_drivers; i++) {
+		drv = i2o_drivers[i];
+
+		if (drv)
+			i2o_driver_notify_device_remove(drv, i2o_dev);
+	}
+}
+
+/**
+ *	i2o_driver_init - initialize I2O drivers (OSMs)
+ *
+ *	Registers the I2O bus and allocate memory for the array of OSMs.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int __init i2o_driver_init(void)
+{
+	int rc = 0;
+
+	spin_lock_init(&i2o_drivers_lock);
+
+	if ((i2o_max_drivers < 2) || (i2o_max_drivers > 64)) {
+		osm_warn("max_drivers set to %d, but must be >=2 and <= 64\n",
+			 i2o_max_drivers);
+		i2o_max_drivers = I2O_MAX_DRIVERS;
+	}
+	osm_info("max drivers = %d\n", i2o_max_drivers);
+
+	i2o_drivers =
+	    kcalloc(i2o_max_drivers, sizeof(*i2o_drivers), GFP_KERNEL);
+	if (!i2o_drivers)
+		return -ENOMEM;
+
+	rc = bus_register(&i2o_bus_type);
+
+	if (rc < 0)
+		kfree(i2o_drivers);
+
+	return rc;
+};
+
+/**
+ *	i2o_driver_exit - clean up I2O drivers (OSMs)
+ *
+ *	Unregisters the I2O bus and frees driver array.
+ */
+void i2o_driver_exit(void)
+{
+	bus_unregister(&i2o_bus_type);
+	kfree(i2o_drivers);
+};
+
+EXPORT_SYMBOL(i2o_driver_register);
+EXPORT_SYMBOL(i2o_driver_unregister);
+EXPORT_SYMBOL(i2o_driver_notify_controller_add_all);
+EXPORT_SYMBOL(i2o_driver_notify_controller_remove_all);
+EXPORT_SYMBOL(i2o_driver_notify_device_add_all);
+EXPORT_SYMBOL(i2o_driver_notify_device_remove_all);
diff --git a/drivers/staging/i2o/exec-osm.c b/drivers/staging/i2o/exec-osm.c
new file mode 100644
index 000000000000..16d857d5e655
--- /dev/null
+++ b/drivers/staging/i2o/exec-osm.c
@@ -0,0 +1,612 @@
+/*
+ *	Executive OSM
+ *
+ * 	Copyright (C) 1999-2002	Red Hat Software
+ *
+ *	Written by Alan Cox, Building Number Three Ltd
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	A lot of the I2O message side code from this is taken from the Red
+ *	Creek RCPCI45 adapter driver by Red Creek Communications
+ *
+ *	Fixes/additions:
+ *		Philipp Rumpf
+ *		Juha Sievänen <Juha.Sievanen@cs.Helsinki.FI>
+ *		Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
+ *		Deepak Saxena <deepak@plexity.net>
+ *		Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>:
+ *			Ported to Linux 2.5.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Minor fixes for 2.6.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Support for sysfs included.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sched.h>	/* wait_event_interruptible_timeout() needs this */
+#include <asm/param.h>		/* HZ */
+#include "core.h"
+
+#define OSM_NAME "exec-osm"
+
+struct i2o_driver i2o_exec_driver;
+
+/* global wait list for POST WAIT */
+static LIST_HEAD(i2o_exec_wait_list);
+
+/* Wait struct needed for POST WAIT */
+struct i2o_exec_wait {
+	wait_queue_head_t *wq;	/* Pointer to Wait queue */
+	struct i2o_dma dma;	/* DMA buffers to free on failure */
+	u32 tcntxt;		/* transaction context from reply */
+	int complete;		/* 1 if reply received otherwise 0 */
+	u32 m;			/* message id */
+	struct i2o_message *msg;	/* pointer to the reply message */
+	struct list_head list;	/* node in global wait list */
+	spinlock_t lock;	/* lock before modifying */
+};
+
+/* Work struct needed to handle LCT NOTIFY replies */
+struct i2o_exec_lct_notify_work {
+	struct work_struct work;	/* work struct */
+	struct i2o_controller *c;	/* controller on which the LCT NOTIFY
+					   was received */
+};
+
+/* Exec OSM class handling definition */
+static struct i2o_class_id i2o_exec_class_id[] = {
+	{I2O_CLASS_EXECUTIVE},
+	{I2O_CLASS_END}
+};
+
+/**
+ *	i2o_exec_wait_alloc - Allocate a i2o_exec_wait struct an initialize it
+ *
+ *	Allocate the i2o_exec_wait struct and initialize the wait.
+ *
+ *	Returns i2o_exec_wait pointer on success or negative error code on
+ *	failure.
+ */
+static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
+{
+	struct i2o_exec_wait *wait;
+
+	wait = kzalloc(sizeof(*wait), GFP_KERNEL);
+	if (!wait)
+		return NULL;
+
+	INIT_LIST_HEAD(&wait->list);
+	spin_lock_init(&wait->lock);
+
+	return wait;
+};
+
+/**
+ *	i2o_exec_wait_free - Free an i2o_exec_wait struct
+ *	@wait: I2O wait data which should be cleaned up
+ */
+static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
+{
+	kfree(wait);
+};
+
+/**
+ * 	i2o_msg_post_wait_mem - Post and wait a message with DMA buffers
+ *	@c: controller
+ *	@msg: message to post
+ *	@timeout: time in seconds to wait
+ *	@dma: i2o_dma struct of the DMA buffer to free on failure
+ *
+ * 	This API allows an OSM to post a message and then be told whether or
+ *	not the system received a successful reply. If the message times out
+ *	then the value '-ETIMEDOUT' is returned. This is a special case. In
+ *	this situation the message may (should) complete at an indefinite time
+ *	in the future. When it completes it will use the memory buffer
+ *	attached to the request. If -ETIMEDOUT is returned then the memory
+ *	buffer must not be freed. Instead the event completion will free them
+ *	for you. In all other cases the buffer are your problem.
+ *
+ *	Returns 0 on success, negative error code on timeout or positive error
+ *	code from reply.
+ */
+int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
+			  unsigned long timeout, struct i2o_dma *dma)
+{
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	struct i2o_exec_wait *wait;
+	static u32 tcntxt = 0x80000000;
+	unsigned long flags;
+	int rc = 0;
+
+	wait = i2o_exec_wait_alloc();
+	if (!wait) {
+		i2o_msg_nop(c, msg);
+		return -ENOMEM;
+	}
+
+	if (tcntxt == 0xffffffff)
+		tcntxt = 0x80000000;
+
+	if (dma)
+		wait->dma = *dma;
+
+	/*
+	 * Fill in the message initiator context and transaction context.
+	 * We will only use transaction contexts >= 0x80000000 for POST WAIT,
+	 * so we could find a POST WAIT reply easier in the reply handler.
+	 */
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	wait->tcntxt = tcntxt++;
+	msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
+
+	wait->wq = &wq;
+	/*
+	 * we add elements to the head, because if a entry in the list will
+	 * never be removed, we have to iterate over it every time
+	 */
+	list_add(&wait->list, &i2o_exec_wait_list);
+
+	/*
+	 * Post the message to the controller. At some point later it will
+	 * return. If we time out before it returns then complete will be zero.
+	 */
+	i2o_msg_post(c, msg);
+
+	wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ);
+
+	spin_lock_irqsave(&wait->lock, flags);
+
+	wait->wq = NULL;
+
+	if (wait->complete)
+		rc = le32_to_cpu(wait->msg->body[0]) >> 24;
+	else {
+		/*
+		 * We cannot remove it now. This is important. When it does
+		 * terminate (which it must do if the controller has not
+		 * died...) then it will otherwise scribble on stuff.
+		 *
+		 * FIXME: try abort message
+		 */
+		if (dma)
+			dma->virt = NULL;
+
+		rc = -ETIMEDOUT;
+	}
+
+	spin_unlock_irqrestore(&wait->lock, flags);
+
+	if (rc != -ETIMEDOUT) {
+		i2o_flush_reply(c, wait->m);
+		i2o_exec_wait_free(wait);
+	}
+
+	return rc;
+};
+
+/**
+ *	i2o_msg_post_wait_complete - Reply to a i2o_msg_post request from IOP
+ *	@c: I2O controller which answers
+ *	@m: message id
+ *	@msg: pointer to the I2O reply message
+ *	@context: transaction context of request
+ *
+ *	This function is called in interrupt context only. If the reply reached
+ *	before the timeout, the i2o_exec_wait struct is filled with the message
+ *	and the task will be waked up. The task is now responsible for returning
+ *	the message m back to the controller! If the message reaches us after
+ *	the timeout clean up the i2o_exec_wait struct (including allocated
+ *	DMA buffer).
+ *
+ *	Return 0 on success and if the message m should not be given back to the
+ *	I2O controller, or >0 on success and if the message should be given back
+ *	afterwords. Returns negative error code on failure. In this case the
+ *	message must also be given back to the controller.
+ */
+static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
+				      struct i2o_message *msg, u32 context)
+{
+	struct i2o_exec_wait *wait, *tmp;
+	unsigned long flags;
+	int rc = 1;
+
+	/*
+	 * We need to search through the i2o_exec_wait_list to see if the given
+	 * message is still outstanding. If not, it means that the IOP took
+	 * longer to respond to the message than we had allowed and timer has
+	 * already expired. Not much we can do about that except log it for
+	 * debug purposes, increase timeout, and recompile.
+	 */
+	list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) {
+		if (wait->tcntxt == context) {
+			spin_lock_irqsave(&wait->lock, flags);
+
+			list_del(&wait->list);
+
+			wait->m = m;
+			wait->msg = msg;
+			wait->complete = 1;
+
+			if (wait->wq)
+				rc = 0;
+			else
+				rc = -1;
+
+			spin_unlock_irqrestore(&wait->lock, flags);
+
+			if (rc) {
+				struct device *dev;
+
+				dev = &c->pdev->dev;
+
+				pr_debug("%s: timedout reply received!\n",
+					 c->name);
+				i2o_dma_free(dev, &wait->dma);
+				i2o_exec_wait_free(wait);
+			} else
+				wake_up_interruptible(wait->wq);
+
+			return rc;
+		}
+	}
+
+	osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
+		 context);
+
+	return -1;
+};
+
+/**
+ *	i2o_exec_show_vendor_id - Displays Vendor ID of controller
+ *	@d: device of which the Vendor ID should be displayed
+ *	@attr: device_attribute to display
+ *	@buf: buffer into which the Vendor ID should be printed
+ *
+ *	Returns number of bytes printed into buffer.
+ */
+static ssize_t i2o_exec_show_vendor_id(struct device *d,
+				       struct device_attribute *attr, char *buf)
+{
+	struct i2o_device *dev = to_i2o_device(d);
+	u16 id;
+
+	if (!i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) {
+		sprintf(buf, "0x%04x", le16_to_cpu(id));
+		return strlen(buf) + 1;
+	}
+
+	return 0;
+};
+
+/**
+ *	i2o_exec_show_product_id - Displays Product ID of controller
+ *	@d: device of which the Product ID should be displayed
+ *	@attr: device_attribute to display
+ *	@buf: buffer into which the Product ID should be printed
+ *
+ *	Returns number of bytes printed into buffer.
+ */
+static ssize_t i2o_exec_show_product_id(struct device *d,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct i2o_device *dev = to_i2o_device(d);
+	u16 id;
+
+	if (!i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) {
+		sprintf(buf, "0x%04x", le16_to_cpu(id));
+		return strlen(buf) + 1;
+	}
+
+	return 0;
+};
+
+/* Exec-OSM device attributes */
+static DEVICE_ATTR(vendor_id, S_IRUGO, i2o_exec_show_vendor_id, NULL);
+static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL);
+
+/**
+ *	i2o_exec_probe - Called if a new I2O device (executive class) appears
+ *	@dev: I2O device which should be probed
+ *
+ *	Registers event notification for every event from Executive device. The
+ *	return is always 0, because we want all devices of class Executive.
+ *
+ *	Returns 0 on success.
+ */
+static int i2o_exec_probe(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	int rc;
+
+	rc = i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff);
+	if (rc) goto err_out;
+
+	rc = device_create_file(dev, &dev_attr_vendor_id);
+	if (rc) goto err_evtreg;
+	rc = device_create_file(dev, &dev_attr_product_id);
+	if (rc) goto err_vid;
+
+	i2o_dev->iop->exec = i2o_dev;
+
+	return 0;
+
+err_vid:
+	device_remove_file(dev, &dev_attr_vendor_id);
+err_evtreg:
+	i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0);
+err_out:
+	return rc;
+};
+
+/**
+ *	i2o_exec_remove - Called on I2O device removal
+ *	@dev: I2O device which was removed
+ *
+ *	Unregisters event notification from Executive I2O device.
+ *
+ *	Returns 0 on success.
+ */
+static int i2o_exec_remove(struct device *dev)
+{
+	device_remove_file(dev, &dev_attr_product_id);
+	device_remove_file(dev, &dev_attr_vendor_id);
+
+	i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0);
+
+	return 0;
+};
+
+#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
+/**
+ *	i2o_exec_lct_notify - Send a asynchronus LCT NOTIFY request
+ *	@c: I2O controller to which the request should be send
+ *	@change_ind: change indicator
+ *
+ *	This function sends a LCT NOTIFY request to the I2O controller with
+ *	the change indicator change_ind. If the change_ind == 0 the controller
+ *	replies immediately after the request. If change_ind > 0 the reply is
+ *	send after change indicator of the LCT is > change_ind.
+ */
+static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
+{
+	i2o_status_block *sb = c->status_block.virt;
+	struct device *dev;
+	struct i2o_message *msg;
+
+	mutex_lock(&c->lct_lock);
+
+	dev = &c->pdev->dev;
+
+	if (i2o_dma_realloc(dev, &c->dlct,
+					le32_to_cpu(sb->expected_lct_size))) {
+		mutex_unlock(&c->lct_lock);
+		return -ENOMEM;
+	}
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg)) {
+		mutex_unlock(&c->lct_lock);
+		return PTR_ERR(msg);
+	}
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+	msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
+				     ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0xffffffff);
+	msg->body[1] = cpu_to_le32(change_ind);
+	msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
+	msg->body[3] = cpu_to_le32(c->dlct.phys);
+
+	i2o_msg_post(c, msg);
+
+	mutex_unlock(&c->lct_lock);
+
+	return 0;
+}
+#endif
+
+/**
+ *	i2o_exec_lct_modified - Called on LCT NOTIFY reply
+ *	@_work: work struct for a specific controller
+ *
+ *	This function handles asynchronus LCT NOTIFY replies. It parses the
+ *	new LCT and if the buffer for the LCT was to small sends a LCT NOTIFY
+ *	again, otherwise send LCT NOTIFY to get informed on next LCT change.
+ */
+static void i2o_exec_lct_modified(struct work_struct *_work)
+{
+	struct i2o_exec_lct_notify_work *work =
+		container_of(_work, struct i2o_exec_lct_notify_work, work);
+	u32 change_ind = 0;
+	struct i2o_controller *c = work->c;
+
+	kfree(work);
+
+	if (i2o_device_parse_lct(c) != -EAGAIN)
+		change_ind = c->lct->change_ind + 1;
+
+#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
+	i2o_exec_lct_notify(c, change_ind);
+#endif
+};
+
+/**
+ *	i2o_exec_reply -  I2O Executive reply handler
+ *	@c: I2O controller from which the reply comes
+ *	@m: message id
+ *	@msg: pointer to the I2O reply message
+ *
+ *	This function is always called from interrupt context. If a POST WAIT
+ *	reply was received, pass it to the complete function. If a LCT NOTIFY
+ *	reply was received, a new event is created to handle the update.
+ *
+ *	Returns 0 on success and if the reply should not be flushed or > 0
+ *	on success and if the reply should be flushed. Returns negative error
+ *	code on failure and if the reply should be flushed.
+ */
+static int i2o_exec_reply(struct i2o_controller *c, u32 m,
+			  struct i2o_message *msg)
+{
+	u32 context;
+
+	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
+		struct i2o_message __iomem *pmsg;
+		u32 pm;
+
+		/*
+		 * If Fail bit is set we must take the transaction context of
+		 * the preserved message to find the right request again.
+		 */
+
+		pm = le32_to_cpu(msg->body[3]);
+		pmsg = i2o_msg_in_to_virt(c, pm);
+		context = readl(&pmsg->u.s.tcntxt);
+
+		i2o_report_status(KERN_INFO, "i2o_core", msg);
+
+		/* Release the preserved msg */
+		i2o_msg_nop_mfa(c, pm);
+	} else
+		context = le32_to_cpu(msg->u.s.tcntxt);
+
+	if (context & 0x80000000)
+		return i2o_msg_post_wait_complete(c, m, msg, context);
+
+	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) {
+		struct i2o_exec_lct_notify_work *work;
+
+		pr_debug("%s: LCT notify received\n", c->name);
+
+		work = kmalloc(sizeof(*work), GFP_ATOMIC);
+		if (!work)
+			return -ENOMEM;
+
+		work->c = c;
+
+		INIT_WORK(&work->work, i2o_exec_lct_modified);
+		queue_work(i2o_exec_driver.event_queue, &work->work);
+		return 1;
+	}
+
+	/*
+	 * If this happens, we want to dump the message to the syslog so
+	 * it can be sent back to the card manufacturer by the end user
+	 * to aid in debugging.
+	 *
+	 */
+	printk(KERN_WARNING "%s: Unsolicited message reply sent to core!"
+	       "Message dumped to syslog\n", c->name);
+	i2o_dump_message(msg);
+
+	return -EFAULT;
+}
+
+/**
+ *	i2o_exec_event - Event handling function
+ *	@work: Work item in occurring event
+ *
+ *	Handles events send by the Executive device. At the moment does not do
+ *	anything useful.
+ */
+static void i2o_exec_event(struct work_struct *work)
+{
+	struct i2o_event *evt = container_of(work, struct i2o_event, work);
+
+	if (likely(evt->i2o_dev))
+		osm_debug("Event received from device: %d\n",
+			  evt->i2o_dev->lct_data.tid);
+	kfree(evt);
+};
+
+/**
+ *	i2o_exec_lct_get - Get the IOP's Logical Configuration Table
+ *	@c: I2O controller from which the LCT should be fetched
+ *
+ *	Send a LCT NOTIFY request to the controller, and wait
+ *	I2O_TIMEOUT_LCT_GET seconds until arrival of response. If the LCT is
+ *	to large, retry it.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_exec_lct_get(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	int i = 0;
+	int rc = -EAGAIN;
+
+	for (i = 1; i <= I2O_LCT_GET_TRIES; i++) {
+		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
+
+		msg->u.head[0] =
+		    cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
+				ADAPTER_TID);
+		msg->body[0] = cpu_to_le32(0xffffffff);
+		msg->body[1] = cpu_to_le32(0x00000000);
+		msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
+		msg->body[3] = cpu_to_le32(c->dlct.phys);
+
+		rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET);
+		if (rc < 0)
+			break;
+
+		rc = i2o_device_parse_lct(c);
+		if (rc != -EAGAIN)
+			break;
+	}
+
+	return rc;
+}
+
+/* Exec OSM driver struct */
+struct i2o_driver i2o_exec_driver = {
+	.name = OSM_NAME,
+	.reply = i2o_exec_reply,
+	.event = i2o_exec_event,
+	.classes = i2o_exec_class_id,
+	.driver = {
+		   .probe = i2o_exec_probe,
+		   .remove = i2o_exec_remove,
+		   },
+};
+
+/**
+ *	i2o_exec_init - Registers the Exec OSM
+ *
+ *	Registers the Exec OSM in the I2O core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int __init i2o_exec_init(void)
+{
+	return i2o_driver_register(&i2o_exec_driver);
+};
+
+/**
+ *	i2o_exec_exit - Removes the Exec OSM
+ *
+ *	Unregisters the Exec OSM from the I2O core.
+ */
+void i2o_exec_exit(void)
+{
+	i2o_driver_unregister(&i2o_exec_driver);
+};
+
+EXPORT_SYMBOL(i2o_msg_post_wait_mem);
+EXPORT_SYMBOL(i2o_exec_lct_get);
diff --git a/drivers/staging/i2o/i2o.h b/drivers/staging/i2o/i2o.h
new file mode 100644
index 000000000000..d23c3c20b201
--- /dev/null
+++ b/drivers/staging/i2o/i2o.h
@@ -0,0 +1,988 @@
+/*
+ * I2O kernel space accessible structures/APIs
+ *
+ * (c) Copyright 1999, 2000 Red Hat Software
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *************************************************************************
+ *
+ * This header file defined the I2O APIs/structures for use by
+ * the I2O kernel modules.
+ *
+ */
+
+#ifndef _I2O_H
+#define _I2O_H
+
+#include <linux/i2o-dev.h>
+
+/* How many different OSM's are we allowing */
+#define I2O_MAX_DRIVERS		8
+
+#include <linux/pci.h>
+#include <linux/bug.h>
+#include <linux/dma-mapping.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>	/* work_struct */
+#include <linux/mempool.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/semaphore.h>	/* Needed for MUTEX init macros */
+
+#include <asm/io.h>
+
+/* message queue empty */
+#define I2O_QUEUE_EMPTY		0xffffffff
+
+/*
+ *	Cache strategies
+ */
+
+/*	The NULL strategy leaves everything up to the controller. This tends to be a
+ *	pessimal but functional choice.
+ */
+#define CACHE_NULL		0
+/*	Prefetch data when reading. We continually attempt to load the next 32 sectors
+ *	into the controller cache.
+ */
+#define CACHE_PREFETCH		1
+/*	Prefetch data when reading. We sometimes attempt to load the next 32 sectors
+ *	into the controller cache. When an I/O is less <= 8K we assume its probably
+ *	not sequential and don't prefetch (default)
+ */
+#define CACHE_SMARTFETCH	2
+/*	Data is written to the cache and then out on to the disk. The I/O must be
+ *	physically on the medium before the write is acknowledged (default without
+ *	NVRAM)
+ */
+#define CACHE_WRITETHROUGH	17
+/*	Data is written to the cache and then out on to the disk. The controller
+ *	is permitted to write back the cache any way it wants. (default if battery
+ *	backed NVRAM is present). It can be useful to set this for swap regardless of
+ *	battery state.
+ */
+#define CACHE_WRITEBACK		18
+/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
+ *	write large I/O's directly to disk bypassing the cache to avoid the extra
+ *	memory copy hits. Small writes are writeback cached
+ */
+#define CACHE_SMARTBACK		19
+/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
+ *	write large I/O's directly to disk bypassing the cache to avoid the extra
+ *	memory copy hits. Small writes are writethrough cached. Suitable for devices
+ *	lacking battery backup
+ */
+#define CACHE_SMARTTHROUGH	20
+
+/*
+ *	Ioctl structures
+ */
+
+#define 	BLKI2OGRSTRAT	_IOR('2', 1, int)
+#define 	BLKI2OGWSTRAT	_IOR('2', 2, int)
+#define 	BLKI2OSRSTRAT	_IOW('2', 3, int)
+#define 	BLKI2OSWSTRAT	_IOW('2', 4, int)
+
+/*
+ *	I2O Function codes
+ */
+
+/*
+ *	Executive Class
+ */
+#define	I2O_CMD_ADAPTER_ASSIGN		0xB3
+#define	I2O_CMD_ADAPTER_READ		0xB2
+#define	I2O_CMD_ADAPTER_RELEASE		0xB5
+#define	I2O_CMD_BIOS_INFO_SET		0xA5
+#define	I2O_CMD_BOOT_DEVICE_SET		0xA7
+#define	I2O_CMD_CONFIG_VALIDATE		0xBB
+#define	I2O_CMD_CONN_SETUP		0xCA
+#define	I2O_CMD_DDM_DESTROY		0xB1
+#define	I2O_CMD_DDM_ENABLE		0xD5
+#define	I2O_CMD_DDM_QUIESCE		0xC7
+#define	I2O_CMD_DDM_RESET		0xD9
+#define	I2O_CMD_DDM_SUSPEND		0xAF
+#define	I2O_CMD_DEVICE_ASSIGN		0xB7
+#define	I2O_CMD_DEVICE_RELEASE		0xB9
+#define	I2O_CMD_HRT_GET			0xA8
+#define	I2O_CMD_ADAPTER_CLEAR		0xBE
+#define	I2O_CMD_ADAPTER_CONNECT		0xC9
+#define	I2O_CMD_ADAPTER_RESET		0xBD
+#define	I2O_CMD_LCT_NOTIFY		0xA2
+#define	I2O_CMD_OUTBOUND_INIT		0xA1
+#define	I2O_CMD_PATH_ENABLE		0xD3
+#define	I2O_CMD_PATH_QUIESCE		0xC5
+#define	I2O_CMD_PATH_RESET		0xD7
+#define	I2O_CMD_STATIC_MF_CREATE	0xDD
+#define	I2O_CMD_STATIC_MF_RELEASE	0xDF
+#define	I2O_CMD_STATUS_GET		0xA0
+#define	I2O_CMD_SW_DOWNLOAD		0xA9
+#define	I2O_CMD_SW_UPLOAD		0xAB
+#define	I2O_CMD_SW_REMOVE		0xAD
+#define	I2O_CMD_SYS_ENABLE		0xD1
+#define	I2O_CMD_SYS_MODIFY		0xC1
+#define	I2O_CMD_SYS_QUIESCE		0xC3
+#define	I2O_CMD_SYS_TAB_SET		0xA3
+
+/*
+ * Utility Class
+ */
+#define I2O_CMD_UTIL_NOP		0x00
+#define I2O_CMD_UTIL_ABORT		0x01
+#define I2O_CMD_UTIL_CLAIM		0x09
+#define I2O_CMD_UTIL_RELEASE		0x0B
+#define I2O_CMD_UTIL_PARAMS_GET		0x06
+#define I2O_CMD_UTIL_PARAMS_SET		0x05
+#define I2O_CMD_UTIL_EVT_REGISTER	0x13
+#define I2O_CMD_UTIL_EVT_ACK		0x14
+#define I2O_CMD_UTIL_CONFIG_DIALOG	0x10
+#define I2O_CMD_UTIL_DEVICE_RESERVE	0x0D
+#define I2O_CMD_UTIL_DEVICE_RELEASE	0x0F
+#define I2O_CMD_UTIL_LOCK		0x17
+#define I2O_CMD_UTIL_LOCK_RELEASE	0x19
+#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY	0x15
+
+/*
+ * SCSI Host Bus Adapter Class
+ */
+#define I2O_CMD_SCSI_EXEC		0x81
+#define I2O_CMD_SCSI_ABORT		0x83
+#define I2O_CMD_SCSI_BUSRESET		0x27
+
+/*
+ * Bus Adapter Class
+ */
+#define I2O_CMD_BUS_ADAPTER_RESET	0x85
+#define I2O_CMD_BUS_RESET		0x87
+#define I2O_CMD_BUS_SCAN		0x89
+#define I2O_CMD_BUS_QUIESCE		0x8b
+
+/*
+ * Random Block Storage Class
+ */
+#define I2O_CMD_BLOCK_READ		0x30
+#define I2O_CMD_BLOCK_WRITE		0x31
+#define I2O_CMD_BLOCK_CFLUSH		0x37
+#define I2O_CMD_BLOCK_MLOCK		0x49
+#define I2O_CMD_BLOCK_MUNLOCK		0x4B
+#define I2O_CMD_BLOCK_MMOUNT		0x41
+#define I2O_CMD_BLOCK_MEJECT		0x43
+#define I2O_CMD_BLOCK_POWER		0x70
+
+#define I2O_CMD_PRIVATE			0xFF
+
+/* Command status values  */
+
+#define I2O_CMD_IN_PROGRESS	0x01
+#define I2O_CMD_REJECTED	0x02
+#define I2O_CMD_FAILED		0x03
+#define I2O_CMD_COMPLETED	0x04
+
+/* I2O API function return values */
+
+#define I2O_RTN_NO_ERROR			0
+#define I2O_RTN_NOT_INIT			1
+#define I2O_RTN_FREE_Q_EMPTY			2
+#define I2O_RTN_TCB_ERROR			3
+#define I2O_RTN_TRANSACTION_ERROR		4
+#define I2O_RTN_ADAPTER_ALREADY_INIT		5
+#define I2O_RTN_MALLOC_ERROR			6
+#define I2O_RTN_ADPTR_NOT_REGISTERED		7
+#define I2O_RTN_MSG_REPLY_TIMEOUT		8
+#define I2O_RTN_NO_STATUS			9
+#define I2O_RTN_NO_FIRM_VER			10
+#define	I2O_RTN_NO_LINK_SPEED			11
+
+/* Reply message status defines for all messages */
+
+#define I2O_REPLY_STATUS_SUCCESS                    	0x00
+#define I2O_REPLY_STATUS_ABORT_DIRTY                	0x01
+#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER     	0x02
+#define	I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER		0x03
+#define	I2O_REPLY_STATUS_ERROR_DIRTY			0x04
+#define	I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER		0x05
+#define	I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER		0x06
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY		0x08
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER	0x09
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER	0x0A
+#define	I2O_REPLY_STATUS_TRANSACTION_ERROR		0x0B
+#define	I2O_REPLY_STATUS_PROGRESS_REPORT		0x80
+
+/* Status codes and Error Information for Parameter functions */
+
+#define I2O_PARAMS_STATUS_SUCCESS		0x00
+#define I2O_PARAMS_STATUS_BAD_KEY_ABORT		0x01
+#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE   	0x02
+#define I2O_PARAMS_STATUS_BUFFER_FULL		0x03
+#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL	0x04
+#define I2O_PARAMS_STATUS_FIELD_UNREADABLE	0x05
+#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE	0x06
+#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS	0x07
+#define I2O_PARAMS_STATUS_INVALID_GROUP_ID	0x08
+#define I2O_PARAMS_STATUS_INVALID_OPERATION	0x09
+#define I2O_PARAMS_STATUS_NO_KEY_FIELD		0x0A
+#define I2O_PARAMS_STATUS_NO_SUCH_FIELD		0x0B
+#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP	0x0C
+#define I2O_PARAMS_STATUS_OPERATION_ERROR	0x0D
+#define I2O_PARAMS_STATUS_SCALAR_ERROR		0x0E
+#define I2O_PARAMS_STATUS_TABLE_ERROR		0x0F
+#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE	0x10
+
+/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
+ * messages: Table 3-2 Detailed Status Codes.*/
+
+#define I2O_DSC_SUCCESS                        0x0000
+#define I2O_DSC_BAD_KEY                        0x0002
+#define I2O_DSC_TCL_ERROR                      0x0003
+#define I2O_DSC_REPLY_BUFFER_FULL              0x0004
+#define I2O_DSC_NO_SUCH_PAGE                   0x0005
+#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT     0x0006
+#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD     0x0007
+#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE         0x0009
+#define I2O_DSC_UNSUPPORTED_FUNCTION           0x000A
+#define I2O_DSC_DEVICE_LOCKED                  0x000B
+#define I2O_DSC_DEVICE_RESET                   0x000C
+#define I2O_DSC_INAPPROPRIATE_FUNCTION         0x000D
+#define I2O_DSC_INVALID_INITIATOR_ADDRESS      0x000E
+#define I2O_DSC_INVALID_MESSAGE_FLAGS          0x000F
+#define I2O_DSC_INVALID_OFFSET                 0x0010
+#define I2O_DSC_INVALID_PARAMETER              0x0011
+#define I2O_DSC_INVALID_REQUEST                0x0012
+#define I2O_DSC_INVALID_TARGET_ADDRESS         0x0013
+#define I2O_DSC_MESSAGE_TOO_LARGE              0x0014
+#define I2O_DSC_MESSAGE_TOO_SMALL              0x0015
+#define I2O_DSC_MISSING_PARAMETER              0x0016
+#define I2O_DSC_TIMEOUT                        0x0017
+#define I2O_DSC_UNKNOWN_ERROR                  0x0018
+#define I2O_DSC_UNKNOWN_FUNCTION               0x0019
+#define I2O_DSC_UNSUPPORTED_VERSION            0x001A
+#define I2O_DSC_DEVICE_BUSY                    0x001B
+#define I2O_DSC_DEVICE_NOT_AVAILABLE           0x001C
+
+/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
+   Status Codes.*/
+
+#define I2O_BSA_DSC_SUCCESS               0x0000
+#define I2O_BSA_DSC_MEDIA_ERROR           0x0001
+#define I2O_BSA_DSC_ACCESS_ERROR          0x0002
+#define I2O_BSA_DSC_DEVICE_FAILURE        0x0003
+#define I2O_BSA_DSC_DEVICE_NOT_READY      0x0004
+#define I2O_BSA_DSC_MEDIA_NOT_PRESENT     0x0005
+#define I2O_BSA_DSC_MEDIA_LOCKED          0x0006
+#define I2O_BSA_DSC_MEDIA_FAILURE         0x0007
+#define I2O_BSA_DSC_PROTOCOL_FAILURE      0x0008
+#define I2O_BSA_DSC_BUS_FAILURE           0x0009
+#define I2O_BSA_DSC_ACCESS_VIOLATION      0x000A
+#define I2O_BSA_DSC_WRITE_PROTECTED       0x000B
+#define I2O_BSA_DSC_DEVICE_RESET          0x000C
+#define I2O_BSA_DSC_VOLUME_CHANGED        0x000D
+#define I2O_BSA_DSC_TIMEOUT               0x000E
+
+/* FailureStatusCodes, Table 3-3 Message Failure Codes */
+
+#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED             0x81
+#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED            0x82
+#define I2O_FSC_TRANSPORT_CONGESTION                    0x83
+#define I2O_FSC_TRANSPORT_FAILURE                       0x84
+#define I2O_FSC_TRANSPORT_STATE_ERROR                   0x85
+#define I2O_FSC_TRANSPORT_TIME_OUT                      0x86
+#define I2O_FSC_TRANSPORT_ROUTING_FAILURE               0x87
+#define I2O_FSC_TRANSPORT_INVALID_VERSION               0x88
+#define I2O_FSC_TRANSPORT_INVALID_OFFSET                0x89
+#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS             0x8A
+#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL               0x8B
+#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE               0x8C
+#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID             0x8D
+#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID          0x8E
+#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT     0x8F
+#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE               0xFF
+
+/* Device Claim Types */
+#define	I2O_CLAIM_PRIMARY					0x01000000
+#define	I2O_CLAIM_MANAGEMENT					0x02000000
+#define	I2O_CLAIM_AUTHORIZED					0x03000000
+#define	I2O_CLAIM_SECONDARY					0x04000000
+
+/* Message header defines for VersionOffset */
+#define I2OVER15	0x0001
+#define I2OVER20	0x0002
+
+/* Default is 1.5 */
+#define I2OVERSION	I2OVER15
+
+#define SGL_OFFSET_0    I2OVERSION
+#define SGL_OFFSET_4    (0x0040 | I2OVERSION)
+#define SGL_OFFSET_5    (0x0050 | I2OVERSION)
+#define SGL_OFFSET_6    (0x0060 | I2OVERSION)
+#define SGL_OFFSET_7    (0x0070 | I2OVERSION)
+#define SGL_OFFSET_8    (0x0080 | I2OVERSION)
+#define SGL_OFFSET_9    (0x0090 | I2OVERSION)
+#define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
+#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
+#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
+#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
+
+/* Transaction Reply Lists (TRL) Control Word structure */
+#define TRL_SINGLE_FIXED_LENGTH		0x00
+#define TRL_SINGLE_VARIABLE_LENGTH	0x40
+#define TRL_MULTIPLE_FIXED_LENGTH	0x80
+
+ /* msg header defines for MsgFlags */
+#define MSG_STATIC	0x0100
+#define MSG_64BIT_CNTXT	0x0200
+#define MSG_MULTI_TRANS	0x1000
+#define MSG_FAIL	0x2000
+#define MSG_FINAL	0x4000
+#define MSG_REPLY	0x8000
+
+ /* minimum size msg */
+#define THREE_WORD_MSG_SIZE	0x00030000
+#define FOUR_WORD_MSG_SIZE	0x00040000
+#define FIVE_WORD_MSG_SIZE	0x00050000
+#define SIX_WORD_MSG_SIZE	0x00060000
+#define SEVEN_WORD_MSG_SIZE	0x00070000
+#define EIGHT_WORD_MSG_SIZE	0x00080000
+#define NINE_WORD_MSG_SIZE	0x00090000
+#define TEN_WORD_MSG_SIZE	0x000A0000
+#define ELEVEN_WORD_MSG_SIZE	0x000B0000
+#define I2O_MESSAGE_SIZE(x)	((x)<<16)
+
+/* special TID assignments */
+#define ADAPTER_TID		0
+#define HOST_TID		1
+
+/* outbound queue defines */
+#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
+#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
+
+/* inbound queue definitions */
+#define I2O_MSG_INPOOL_MIN		32
+#define I2O_INBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
+
+#define I2O_POST_WAIT_OK	0
+#define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
+
+#define I2O_CONTEXT_LIST_MIN_LENGTH	15
+#define I2O_CONTEXT_LIST_USED		0x01
+#define I2O_CONTEXT_LIST_DELETED	0x02
+
+/* timeouts */
+#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE	15
+#define I2O_TIMEOUT_MESSAGE_GET		5
+#define I2O_TIMEOUT_RESET		30
+#define I2O_TIMEOUT_STATUS_GET		5
+#define I2O_TIMEOUT_LCT_GET		360
+#define I2O_TIMEOUT_SCSI_SCB_ABORT	240
+
+/* retries */
+#define I2O_HRT_GET_TRIES		3
+#define I2O_LCT_GET_TRIES		3
+
+/* defines for max_sectors and max_phys_segments */
+#define I2O_MAX_SECTORS			1024
+#define I2O_MAX_SECTORS_LIMITED		128
+#define I2O_MAX_PHYS_SEGMENTS		BLK_MAX_SEGMENTS
+
+/*
+ *	Message structures
+ */
+struct i2o_message {
+	union {
+		struct {
+			u8 version_offset;
+			u8 flags;
+			u16 size;
+			u32 target_tid:12;
+			u32 init_tid:12;
+			u32 function:8;
+			u32 icntxt;	/* initiator context */
+			u32 tcntxt;	/* transaction context */
+		} s;
+		u32 head[4];
+	} u;
+	/* List follows */
+	u32 body[0];
+};
+
+/* MFA and I2O message used by mempool */
+struct i2o_msg_mfa {
+	u32 mfa;		/* MFA returned by the controller */
+	struct i2o_message msg;	/* I2O message */
+};
+
+/*
+ *	Each I2O device entity has one of these. There is one per device.
+ */
+struct i2o_device {
+	i2o_lct_entry lct_data;	/* Device LCT information */
+
+	struct i2o_controller *iop;	/* Controlling IOP */
+	struct list_head list;	/* node in IOP devices list */
+
+	struct device device;
+
+	struct mutex lock;	/* device lock */
+};
+
+/*
+ *	Event structure provided to the event handling function
+ */
+struct i2o_event {
+	struct work_struct work;
+	struct i2o_device *i2o_dev;	/* I2O device pointer from which the
+					   event reply was initiated */
+	u16 size;		/* Size of data in 32-bit words */
+	u32 tcntxt;		/* Transaction context used at
+				   registration */
+	u32 event_indicator;	/* Event indicator from reply */
+	u32 data[0];		/* Event data from reply */
+};
+
+/*
+ *	I2O classes which could be handled by the OSM
+ */
+struct i2o_class_id {
+	u16 class_id:12;
+};
+
+/*
+ *	I2O driver structure for OSMs
+ */
+struct i2o_driver {
+	char *name;		/* OSM name */
+	int context;		/* Low 8 bits of the transaction info */
+	struct i2o_class_id *classes;	/* I2O classes that this OSM handles */
+
+	/* Message reply handler */
+	int (*reply) (struct i2o_controller *, u32, struct i2o_message *);
+
+	/* Event handler */
+	work_func_t event;
+
+	struct workqueue_struct *event_queue;	/* Event queue */
+
+	struct device_driver driver;
+
+	/* notification of changes */
+	void (*notify_controller_add) (struct i2o_controller *);
+	void (*notify_controller_remove) (struct i2o_controller *);
+	void (*notify_device_add) (struct i2o_device *);
+	void (*notify_device_remove) (struct i2o_device *);
+
+	struct semaphore lock;
+};
+
+/*
+ *	Contains DMA mapped address information
+ */
+struct i2o_dma {
+	void *virt;
+	dma_addr_t phys;
+	size_t len;
+};
+
+/*
+ *	Contains slab cache and mempool information
+ */
+struct i2o_pool {
+	char *name;
+	struct kmem_cache *slab;
+	mempool_t *mempool;
+};
+
+/*
+ *	Contains IO mapped address information
+ */
+struct i2o_io {
+	void __iomem *virt;
+	unsigned long phys;
+	unsigned long len;
+};
+
+/*
+ *	Context queue entry, used for 32-bit context on 64-bit systems
+ */
+struct i2o_context_list_element {
+	struct list_head list;
+	u32 context;
+	void *ptr;
+	unsigned long timestamp;
+};
+
+/*
+ * Each I2O controller has one of these objects
+ */
+struct i2o_controller {
+	char name[16];
+	int unit;
+	int type;
+
+	struct pci_dev *pdev;	/* PCI device */
+
+	unsigned int promise:1;	/* Promise controller */
+	unsigned int adaptec:1;	/* DPT / Adaptec controller */
+	unsigned int raptor:1;	/* split bar */
+	unsigned int no_quiesce:1;	/* dont quiesce before reset */
+	unsigned int short_req:1;	/* use small block sizes */
+	unsigned int limit_sectors:1;	/* limit number of sectors / request */
+	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
+
+	struct list_head devices;	/* list of I2O devices */
+	struct list_head list;	/* Controller list */
+
+	void __iomem *in_port;	/* Inbout port address */
+	void __iomem *out_port;	/* Outbound port address */
+	void __iomem *irq_status;	/* Interrupt status register address */
+	void __iomem *irq_mask;	/* Interrupt mask register address */
+
+	struct i2o_dma status;	/* IOP status block */
+
+	struct i2o_dma hrt;	/* HW Resource Table */
+	i2o_lct *lct;		/* Logical Config Table */
+	struct i2o_dma dlct;	/* Temp LCT */
+	struct mutex lct_lock;	/* Lock for LCT updates */
+	struct i2o_dma status_block;	/* IOP status block */
+
+	struct i2o_io base;	/* controller messaging unit */
+	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
+	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
+
+	struct i2o_pool in_msg;	/* mempool for inbound messages */
+
+	unsigned int battery:1;	/* Has a battery backup */
+	unsigned int io_alloc:1;	/* An I/O resource was allocated */
+	unsigned int mem_alloc:1;	/* A memory resource was allocated */
+
+	struct resource io_resource;	/* I/O resource allocated to the IOP */
+	struct resource mem_resource;	/* Mem resource allocated to the IOP */
+
+	struct device device;
+	struct i2o_device *exec;	/* Executive */
+#if BITS_PER_LONG == 64
+	spinlock_t context_list_lock;	/* lock for context_list */
+	atomic_t context_list_counter;	/* needed for unique contexts */
+	struct list_head context_list;	/* list of context id's
+					   and pointers */
+#endif
+	spinlock_t lock;	/* lock for controller
+				   configuration */
+	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
+};
+
+/*
+ * I2O System table entry
+ *
+ * The system table contains information about all the IOPs in the
+ * system.  It is sent to all IOPs so that they can create peer2peer
+ * connections between them.
+ */
+struct i2o_sys_tbl_entry {
+	u16 org_id;
+	u16 reserved1;
+	u32 iop_id:12;
+	u32 reserved2:20;
+	u16 seg_num:12;
+	u16 i2o_version:4;
+	u8 iop_state;
+	u8 msg_type;
+	u16 frame_size;
+	u16 reserved3;
+	u32 last_changed;
+	u32 iop_capabilities;
+	u32 inbound_low;
+	u32 inbound_high;
+};
+
+struct i2o_sys_tbl {
+	u8 num_entries;
+	u8 version;
+	u16 reserved1;
+	u32 change_ind;
+	u32 reserved2;
+	u32 reserved3;
+	struct i2o_sys_tbl_entry iops[0];
+};
+
+extern struct list_head i2o_controllers;
+
+/* Message functions */
+extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int);
+extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *,
+				 unsigned long, struct i2o_dma *);
+
+/* IOP functions */
+extern int i2o_status_get(struct i2o_controller *);
+
+extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int,
+			      u32);
+extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16);
+extern struct i2o_controller *i2o_find_iop(int);
+
+/* Functions needed for handling 64-bit pointers in 32-bit context */
+#if BITS_PER_LONG == 64
+extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *);
+extern void *i2o_cntxt_list_get(struct i2o_controller *, u32);
+extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *);
+extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *);
+
+static inline u32 i2o_ptr_low(void *ptr)
+{
+	return (u32) (u64) ptr;
+};
+
+static inline u32 i2o_ptr_high(void *ptr)
+{
+	return (u32) ((u64) ptr >> 32);
+};
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+	return (u32) (u64) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+	return (u32) ((u64) dma_addr >> 32);
+};
+#else
+static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
+{
+	return (void *)context;
+};
+
+static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_ptr_low(void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_ptr_high(void *ptr)
+{
+	return 0;
+};
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+	return (u32) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+	return 0;
+};
+#endif
+
+extern u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size);
+extern dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+					    size_t size,
+					    enum dma_data_direction direction,
+					    u32 ** sg_ptr);
+extern int i2o_dma_map_sg(struct i2o_controller *c,
+				 struct scatterlist *sg, int sg_count,
+				 enum dma_data_direction direction,
+				 u32 ** sg_ptr);
+extern int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len);
+extern void i2o_dma_free(struct device *dev, struct i2o_dma *addr);
+extern int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
+								size_t len);
+extern int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+				 size_t size, int min_nr);
+extern void i2o_pool_free(struct i2o_pool *pool);
+/* I2O driver (OSM) functions */
+extern int i2o_driver_register(struct i2o_driver *);
+extern void i2o_driver_unregister(struct i2o_driver *);
+
+/**
+ *	i2o_driver_notify_controller_add - Send notification of added controller
+ *	@drv: I2O driver
+ *	@c: I2O controller
+ *
+ *	Send notification of added controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
+						    struct i2o_controller *c)
+{
+	if (drv->notify_controller_add)
+		drv->notify_controller_add(c);
+};
+
+/**
+ *	i2o_driver_notify_controller_remove - Send notification of removed controller
+ *	@drv: I2O driver
+ *	@c: I2O controller
+ *
+ *	Send notification of removed controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
+						       struct i2o_controller *c)
+{
+	if (drv->notify_controller_remove)
+		drv->notify_controller_remove(c);
+};
+
+/**
+ *	i2o_driver_notify_device_add - Send notification of added device
+ *	@drv: I2O driver
+ *	@i2o_dev: the added i2o_device
+ *
+ *	Send notification of added device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
+						struct i2o_device *i2o_dev)
+{
+	if (drv->notify_device_add)
+		drv->notify_device_add(i2o_dev);
+};
+
+/**
+ *	i2o_driver_notify_device_remove - Send notification of removed device
+ *	@drv: I2O driver
+ *	@i2o_dev: the added i2o_device
+ *
+ *	Send notification of removed device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
+						   struct i2o_device *i2o_dev)
+{
+	if (drv->notify_device_remove)
+		drv->notify_device_remove(i2o_dev);
+};
+
+extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
+extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
+extern void i2o_driver_notify_device_add_all(struct i2o_device *);
+extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
+
+/* I2O device functions */
+extern int i2o_device_claim(struct i2o_device *);
+extern int i2o_device_claim_release(struct i2o_device *);
+
+/* Exec OSM functions */
+extern int i2o_exec_lct_get(struct i2o_controller *);
+
+/* device / driver / kobject conversion functions */
+#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
+#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
+#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
+
+/**
+ *	i2o_out_to_virt - Turn an I2O message to a virtual address
+ *	@c: controller
+ *	@m: message engine value
+ *
+ *	Turn a receive message from an I2O controller bus address into
+ *	a Linux virtual address. The shared page frame is a linear block
+ *	so we simply have to shift the offset. This function does not
+ *	work for sender side messages as they are ioremap objects
+ *	provided by the I2O controller.
+ */
+static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
+						      u32 m)
+{
+	BUG_ON(m < c->out_queue.phys
+	       || m >= c->out_queue.phys + c->out_queue.len);
+
+	return c->out_queue.virt + (m - c->out_queue.phys);
+};
+
+/**
+ *	i2o_msg_in_to_virt - Turn an I2O message to a virtual address
+ *	@c: controller
+ *	@m: message engine value
+ *
+ *	Turn a send message from an I2O controller bus address into
+ *	a Linux virtual address. The shared page frame is a linear block
+ *	so we simply have to shift the offset. This function does not
+ *	work for receive side messages as they are kmalloc objects
+ *	in a different pool.
+ */
+static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
+							     i2o_controller *c,
+							     u32 m)
+{
+	return c->in_queue.virt + m;
+};
+
+/**
+ *	i2o_msg_get - obtain an I2O message from the IOP
+ *	@c: I2O controller
+ *
+ *	This function tries to get a message frame. If no message frame is
+ *	available do not wait until one is available (see also i2o_msg_get_wait).
+ *	The returned pointer to the message frame is not in I/O memory, it is
+ *	allocated from a mempool. But because a MFA is allocated from the
+ *	controller too it is guaranteed that i2o_msg_post() will never fail.
+ *
+ *	On a success a pointer to the message frame is returned. If the message
+ *	queue is empty -EBUSY is returned and if no memory is available -ENOMEM
+ *	is returned.
+ */
+static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
+{
+	struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC);
+	if (!mmsg)
+		return ERR_PTR(-ENOMEM);
+
+	mmsg->mfa = readl(c->in_port);
+	if (unlikely(mmsg->mfa >= c->in_queue.len)) {
+		u32 mfa = mmsg->mfa;
+
+		mempool_free(mmsg, c->in_msg.mempool);
+
+		if (mfa == I2O_QUEUE_EMPTY)
+			return ERR_PTR(-EBUSY);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return &mmsg->msg;
+};
+
+/**
+ *	i2o_msg_post - Post I2O message to I2O controller
+ *	@c: I2O controller to which the message should be send
+ *	@msg: message returned by i2o_msg_get()
+ *
+ *	Post the message to the I2O controller and return immediately.
+ */
+static inline void i2o_msg_post(struct i2o_controller *c,
+				struct i2o_message *msg)
+{
+	struct i2o_msg_mfa *mmsg;
+
+	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
+	memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg,
+		    (le32_to_cpu(msg->u.head[0]) >> 16) << 2);
+	writel(mmsg->mfa, c->in_port);
+	mempool_free(mmsg, c->in_msg.mempool);
+};
+
+/**
+ * 	i2o_msg_post_wait - Post and wait a message and wait until return
+ *	@c: controller
+ *	@msg: message to post
+ *	@timeout: time in seconds to wait
+ *
+ * 	This API allows an OSM to post a message and then be told whether or
+ *	not the system received a successful reply. If the message times out
+ *	then the value '-ETIMEDOUT' is returned.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static inline int i2o_msg_post_wait(struct i2o_controller *c,
+				    struct i2o_message *msg,
+				    unsigned long timeout)
+{
+	return i2o_msg_post_wait_mem(c, msg, timeout, NULL);
+};
+
+/**
+ *	i2o_msg_nop_mfa - Returns a fetched MFA back to the controller
+ *	@c: I2O controller from which the MFA was fetched
+ *	@mfa: MFA which should be returned
+ *
+ *	This function must be used for preserved messages, because i2o_msg_nop()
+ *	also returns the allocated memory back to the msg_pool mempool.
+ */
+static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa)
+{
+	struct i2o_message __iomem *msg;
+	u32 nop[3] = {
+		THREE_WORD_MSG_SIZE | SGL_OFFSET_0,
+		I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
+		0x00000000
+	};
+
+	msg = i2o_msg_in_to_virt(c, mfa);
+	memcpy_toio(msg, nop, sizeof(nop));
+	writel(mfa, c->in_port);
+};
+
+/**
+ *	i2o_msg_nop - Returns a message which is not used
+ *	@c: I2O controller from which the message was created
+ *	@msg: message which should be returned
+ *
+ *	If you fetch a message via i2o_msg_get, and can't use it, you must
+ *	return the message with this function. Otherwise the MFA is lost as well
+ *	as the allocated memory from the mempool.
+ */
+static inline void i2o_msg_nop(struct i2o_controller *c,
+			       struct i2o_message *msg)
+{
+	struct i2o_msg_mfa *mmsg;
+	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
+
+	i2o_msg_nop_mfa(c, mmsg->mfa);
+	mempool_free(mmsg, c->in_msg.mempool);
+};
+
+/**
+ *	i2o_flush_reply - Flush reply from I2O controller
+ *	@c: I2O controller
+ *	@m: the message identifier
+ *
+ *	The I2O controller must be informed that the reply message is not needed
+ *	anymore. If you forget to flush the reply, the message frame can't be
+ *	used by the controller anymore and is therefore lost.
+ */
+static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
+{
+	writel(m, c->out_port);
+};
+
+/*
+ *	Endian handling wrapped into the macro - keeps the core code
+ *	cleaner.
+ */
+
+#define i2o_raw_writel(val, mem)	__raw_writel(cpu_to_le32(val), mem)
+
+extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
+extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
+			      void *, int);
+
+/* debugging and troubleshooting/diagnostic helpers. */
+#define osm_printk(level, format, arg...)  \
+	printk(level "%s: " format, OSM_NAME , ## arg)
+
+#ifdef DEBUG
+#define osm_debug(format, arg...) \
+	osm_printk(KERN_DEBUG, format , ## arg)
+#else
+#define osm_debug(format, arg...) \
+        do { } while (0)
+#endif
+
+#define osm_err(format, arg...)		\
+	osm_printk(KERN_ERR, format , ## arg)
+#define osm_info(format, arg...)		\
+	osm_printk(KERN_INFO, format , ## arg)
+#define osm_warn(format, arg...)		\
+	osm_printk(KERN_WARNING, format , ## arg)
+
+/* debugging functions */
+extern void i2o_report_status(const char *, const char *, struct i2o_message *);
+extern void i2o_dump_message(struct i2o_message *);
+extern void i2o_dump_hrt(struct i2o_controller *c);
+extern void i2o_debug_state(struct i2o_controller *c);
+
+#endif				/* _I2O_H */
diff --git a/drivers/staging/i2o/i2o_block.c b/drivers/staging/i2o/i2o_block.c
new file mode 100644
index 000000000000..0a13c64ce000
--- /dev/null
+++ b/drivers/staging/i2o/i2o_block.c
@@ -0,0 +1,1228 @@
+/*
+ *	Block OSM
+ *
+ * 	Copyright (C) 1999-2002	Red Hat Software
+ *
+ *	Written by Alan Cox, Building Number Three Ltd
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	This program is distributed in the hope that it will be useful, but
+ *	WITHOUT ANY WARRANTY; without even the implied warranty of
+ *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *	General Public License for more details.
+ *
+ *	For the purpose of avoiding doubt the preferred form of the work
+ *	for making modifications shall be a standards compliant form such
+ *	gzipped tar and not one requiring a proprietary or patent encumbered
+ *	tool to unpack.
+ *
+ *	Fixes/additions:
+ *		Steve Ralston:
+ *			Multiple device handling error fixes,
+ *			Added a queue depth.
+ *		Alan Cox:
+ *			FC920 has an rmw bug. Dont or in the end marker.
+ *			Removed queue walk, fixed for 64bitness.
+ *			Rewrote much of the code over time
+ *			Added indirect block lists
+ *			Handle 64K limits on many controllers
+ *			Don't use indirects on the Promise (breaks)
+ *			Heavily chop down the queue depths
+ *		Deepak Saxena:
+ *			Independent queues per IOP
+ *			Support for dynamic device creation/deletion
+ *			Code cleanup
+ *	    		Support for larger I/Os through merge* functions
+ *			(taken from DAC960 driver)
+ *		Boji T Kannanthanam:
+ *			Set the I2O Block devices to be detected in increasing
+ *			order of TIDs during boot.
+ *			Search and set the I2O block device that we boot off
+ *			from as the first device to be claimed (as /dev/i2o/hda)
+ *			Properly attach/detach I2O gendisk structure from the
+ *			system gendisk list. The I2O block devices now appear in
+ *			/proc/partitions.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Minor bugfixes for 2.6.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "i2o.h"
+#include <linux/mutex.h>
+
+#include <linux/mempool.h>
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+
+#include <scsi/scsi.h>
+
+#include "i2o_block.h"
+
+#define OSM_NAME	"block-osm"
+#define OSM_VERSION	"1.325"
+#define OSM_DESCRIPTION	"I2O Block Device OSM"
+
+static DEFINE_MUTEX(i2o_block_mutex);
+static struct i2o_driver i2o_block_driver;
+
+/* global Block OSM request mempool */
+static struct i2o_block_mempool i2o_blk_req_pool;
+
+/* Block OSM class handling definition */
+static struct i2o_class_id i2o_block_class_id[] = {
+	{I2O_CLASS_RANDOM_BLOCK_STORAGE},
+	{I2O_CLASS_END}
+};
+
+/**
+ *	i2o_block_device_free - free the memory of the I2O Block device
+ *	@dev: I2O Block device, which should be cleaned up
+ *
+ *	Frees the request queue, gendisk and the i2o_block_device structure.
+ */
+static void i2o_block_device_free(struct i2o_block_device *dev)
+{
+	blk_cleanup_queue(dev->gd->queue);
+
+	put_disk(dev->gd);
+
+	kfree(dev);
+};
+
+/**
+ *	i2o_block_remove - remove the I2O Block device from the system again
+ *	@dev: I2O Block device which should be removed
+ *
+ *	Remove gendisk from system and free all allocated memory.
+ *
+ *	Always returns 0.
+ */
+static int i2o_block_remove(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	struct i2o_block_device *i2o_blk_dev = dev_get_drvdata(dev);
+
+	osm_info("device removed (TID: %03x): %s\n", i2o_dev->lct_data.tid,
+		 i2o_blk_dev->gd->disk_name);
+
+	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0);
+
+	del_gendisk(i2o_blk_dev->gd);
+
+	dev_set_drvdata(dev, NULL);
+
+	i2o_device_claim_release(i2o_dev);
+
+	i2o_block_device_free(i2o_blk_dev);
+
+	return 0;
+};
+
+/**
+ *	i2o_block_device flush - Flush all dirty data of I2O device dev
+ *	@dev: I2O device which should be flushed
+ *
+ *	Flushes all dirty data on device dev.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_flush(struct i2o_device *dev)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(60 << 16);
+	osm_debug("Flushing...\n");
+
+	return i2o_msg_post_wait(dev->iop, msg, 60);
+};
+
+/**
+ *	i2o_block_device_mount - Mount (load) the media of device dev
+ *	@dev: I2O device which should receive the mount request
+ *	@media_id: Media Identifier
+ *
+ *	Load a media into drive. Identifier should be set to -1, because the
+ *	spec does not support any other value.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(-1);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	osm_debug("Mounting...\n");
+
+	return i2o_msg_post_wait(dev->iop, msg, 2);
+};
+
+/**
+ *	i2o_block_device_lock - Locks the media of device dev
+ *	@dev: I2O device which should receive the lock request
+ *	@media_id: Media Identifier
+ *
+ *	Lock media of device dev to prevent removal. The media identifier
+ *	should be set to -1, because the spec does not support any other value.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(-1);
+	osm_debug("Locking...\n");
+
+	return i2o_msg_post_wait(dev->iop, msg, 2);
+};
+
+/**
+ *	i2o_block_device_unlock - Unlocks the media of device dev
+ *	@dev: I2O device which should receive the unlocked request
+ *	@media_id: Media Identifier
+ *
+ *	Unlocks the media in device dev. The media identifier should be set to
+ *	-1, because the spec does not support any other value.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(media_id);
+	osm_debug("Unlocking...\n");
+
+	return i2o_msg_post_wait(dev->iop, msg, 2);
+};
+
+/**
+ *	i2o_block_device_power - Power management for device dev
+ *	@dev: I2O device which should receive the power management request
+ *	@op: Operation to send
+ *
+ *	Send a power management request to the device dev.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_power(struct i2o_block_device *dev, u8 op)
+{
+	struct i2o_device *i2o_dev = dev->i2o_dev;
+	struct i2o_controller *c = i2o_dev->iop;
+	struct i2o_message *msg;
+	int rc;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(op << 24);
+	osm_debug("Power...\n");
+
+	rc = i2o_msg_post_wait(c, msg, 60);
+	if (!rc)
+		dev->power = op;
+
+	return rc;
+};
+
+/**
+ *	i2o_block_request_alloc - Allocate an I2O block request struct
+ *
+ *	Allocates an I2O block request struct and initialize the list.
+ *
+ *	Returns a i2o_block_request pointer on success or negative error code
+ *	on failure.
+ */
+static inline struct i2o_block_request *i2o_block_request_alloc(void)
+{
+	struct i2o_block_request *ireq;
+
+	ireq = mempool_alloc(i2o_blk_req_pool.pool, GFP_ATOMIC);
+	if (!ireq)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ireq->queue);
+	sg_init_table(ireq->sg_table, I2O_MAX_PHYS_SEGMENTS);
+
+	return ireq;
+};
+
+/**
+ *	i2o_block_request_free - Frees a I2O block request
+ *	@ireq: I2O block request which should be freed
+ *
+ *	Frees the allocated memory (give it back to the request mempool).
+ */
+static inline void i2o_block_request_free(struct i2o_block_request *ireq)
+{
+	mempool_free(ireq, i2o_blk_req_pool.pool);
+};
+
+/**
+ *	i2o_block_sglist_alloc - Allocate the SG list and map it
+ *	@c: I2O controller to which the request belongs
+ *	@ireq: I2O block request
+ *	@mptr: message body pointer
+ *
+ *	Builds the SG list and map it to be accessible by the controller.
+ *
+ *	Returns 0 on failure or 1 on success.
+ */
+static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
+					 struct i2o_block_request *ireq,
+					 u32 ** mptr)
+{
+	int nents;
+	enum dma_data_direction direction;
+
+	ireq->dev = &c->pdev->dev;
+	nents = blk_rq_map_sg(ireq->req->q, ireq->req, ireq->sg_table);
+
+	if (rq_data_dir(ireq->req) == READ)
+		direction = PCI_DMA_FROMDEVICE;
+	else
+		direction = PCI_DMA_TODEVICE;
+
+	ireq->sg_nents = nents;
+
+	return i2o_dma_map_sg(c, ireq->sg_table, nents, direction, mptr);
+};
+
+/**
+ *	i2o_block_sglist_free - Frees the SG list
+ *	@ireq: I2O block request from which the SG should be freed
+ *
+ *	Frees the SG list from the I2O block request.
+ */
+static inline void i2o_block_sglist_free(struct i2o_block_request *ireq)
+{
+	enum dma_data_direction direction;
+
+	if (rq_data_dir(ireq->req) == READ)
+		direction = PCI_DMA_FROMDEVICE;
+	else
+		direction = PCI_DMA_TODEVICE;
+
+	dma_unmap_sg(ireq->dev, ireq->sg_table, ireq->sg_nents, direction);
+};
+
+/**
+ *	i2o_block_prep_req_fn - Allocates I2O block device specific struct
+ *	@q: request queue for the request
+ *	@req: the request to prepare
+ *
+ *	Allocate the necessary i2o_block_request struct and connect it to
+ *	the request. This is needed that we not lose the SG list later on.
+ *
+ *	Returns BLKPREP_OK on success or BLKPREP_DEFER on failure.
+ */
+static int i2o_block_prep_req_fn(struct request_queue *q, struct request *req)
+{
+	struct i2o_block_device *i2o_blk_dev = q->queuedata;
+	struct i2o_block_request *ireq;
+
+	if (unlikely(!i2o_blk_dev)) {
+		osm_err("block device already removed\n");
+		return BLKPREP_KILL;
+	}
+
+	/* connect the i2o_block_request to the request */
+	if (!req->special) {
+		ireq = i2o_block_request_alloc();
+		if (IS_ERR(ireq)) {
+			osm_debug("unable to allocate i2o_block_request!\n");
+			return BLKPREP_DEFER;
+		}
+
+		ireq->i2o_blk_dev = i2o_blk_dev;
+		req->special = ireq;
+		ireq->req = req;
+	}
+	/* do not come back here */
+	req->cmd_flags |= REQ_DONTPREP;
+
+	return BLKPREP_OK;
+};
+
+/**
+ *	i2o_block_delayed_request_fn - delayed request queue function
+ *	@work: the delayed request with the queue to start
+ *
+ *	If the request queue is stopped for a disk, and there is no open
+ *	request, a new event is created, which calls this function to start
+ *	the queue after I2O_BLOCK_REQUEST_TIME. Otherwise the queue will never
+ *	be started again.
+ */
+static void i2o_block_delayed_request_fn(struct work_struct *work)
+{
+	struct i2o_block_delayed_request *dreq =
+		container_of(work, struct i2o_block_delayed_request,
+			     work.work);
+	struct request_queue *q = dreq->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	blk_start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+	kfree(dreq);
+};
+
+/**
+ *	i2o_block_end_request - Post-processing of completed commands
+ *	@req: request which should be completed
+ *	@error: 0 for success, < 0 for error
+ *	@nr_bytes: number of bytes to complete
+ *
+ *	Mark the request as complete. The lock must not be held when entering.
+ *
+ */
+static void i2o_block_end_request(struct request *req, int error,
+				  int nr_bytes)
+{
+	struct i2o_block_request *ireq = req->special;
+	struct i2o_block_device *dev = ireq->i2o_blk_dev;
+	struct request_queue *q = req->q;
+	unsigned long flags;
+
+	if (blk_end_request(req, error, nr_bytes))
+		if (error)
+			blk_end_request_all(req, -EIO);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	if (likely(dev)) {
+		dev->open_queue_depth--;
+		list_del(&ireq->queue);
+	}
+
+	blk_start_queue(q);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	i2o_block_sglist_free(ireq);
+	i2o_block_request_free(ireq);
+};
+
+/**
+ *	i2o_block_reply - Block OSM reply handler.
+ *	@c: I2O controller from which the message arrives
+ *	@m: message id of reply
+ *	@msg: the actual I2O message reply
+ *
+ *	This function gets all the message replies.
+ *
+ */
+static int i2o_block_reply(struct i2o_controller *c, u32 m,
+			   struct i2o_message *msg)
+{
+	struct request *req;
+	int error = 0;
+
+	req = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
+	if (unlikely(!req)) {
+		osm_err("NULL reply received!\n");
+		return -1;
+	}
+
+	/*
+	 *      Lets see what is cooking. We stuffed the
+	 *      request in the context.
+	 */
+
+	if ((le32_to_cpu(msg->body[0]) >> 24) != 0) {
+		u32 status = le32_to_cpu(msg->body[0]);
+		/*
+		 *      Device not ready means two things. One is that the
+		 *      the thing went offline (but not a removal media)
+		 *
+		 *      The second is that you have a SuperTrak 100 and the
+		 *      firmware got constipated. Unlike standard i2o card
+		 *      setups the supertrak returns an error rather than
+		 *      blocking for the timeout in these cases.
+		 *
+		 *      Don't stick a supertrak100 into cache aggressive modes
+		 */
+
+		osm_err("TID %03x error status: 0x%02x, detailed status: "
+			"0x%04x\n", (le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
+			status >> 24, status & 0xffff);
+
+		req->errors++;
+
+		error = -EIO;
+	}
+
+	i2o_block_end_request(req, error, le32_to_cpu(msg->body[1]));
+
+	return 1;
+};
+
+static void i2o_block_event(struct work_struct *work)
+{
+	struct i2o_event *evt = container_of(work, struct i2o_event, work);
+	osm_debug("event received\n");
+	kfree(evt);
+};
+
+/*
+ *	SCSI-CAM for ioctl geometry mapping
+ *	Duplicated with SCSI - this should be moved into somewhere common
+ *	perhaps genhd ?
+ *
+ * LBA -> CHS mapping table taken from:
+ *
+ * "Incorporating the I2O Architecture into BIOS for Intel Architecture
+ *  Platforms"
+ *
+ * This is an I2O document that is only available to I2O members,
+ * not developers.
+ *
+ * From my understanding, this is how all the I2O cards do this
+ *
+ * Disk Size      | Sectors | Heads | Cylinders
+ * ---------------+---------+-------+-------------------
+ * 1 < X <= 528M  | 63      | 16    | X/(63 * 16 * 512)
+ * 528M < X <= 1G | 63      | 32    | X/(63 * 32 * 512)
+ * 1 < X <528M    | 63      | 16    | X/(63 * 16 * 512)
+ * 1 < X <528M    | 63      | 16    | X/(63 * 16 * 512)
+ *
+ */
+#define	BLOCK_SIZE_528M		1081344
+#define	BLOCK_SIZE_1G		2097152
+#define	BLOCK_SIZE_21G		4403200
+#define	BLOCK_SIZE_42G		8806400
+#define	BLOCK_SIZE_84G		17612800
+
+static void i2o_block_biosparam(unsigned long capacity, unsigned short *cyls,
+				unsigned char *hds, unsigned char *secs)
+{
+	unsigned long heads, sectors, cylinders;
+
+	sectors = 63L;		/* Maximize sectors per track */
+	if (capacity <= BLOCK_SIZE_528M)
+		heads = 16;
+	else if (capacity <= BLOCK_SIZE_1G)
+		heads = 32;
+	else if (capacity <= BLOCK_SIZE_21G)
+		heads = 64;
+	else if (capacity <= BLOCK_SIZE_42G)
+		heads = 128;
+	else
+		heads = 255;
+
+	cylinders = (unsigned long)capacity / (heads * sectors);
+
+	*cyls = (unsigned short)cylinders;	/* Stuff return values */
+	*secs = (unsigned char)sectors;
+	*hds = (unsigned char)heads;
+}
+
+/**
+ *	i2o_block_open - Open the block device
+ *	@bdev: block device being opened
+ *	@mode: file open mode
+ *
+ *	Power up the device, mount and lock the media. This function is called,
+ *	if the block device is opened for access.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_open(struct block_device *bdev, fmode_t mode)
+{
+	struct i2o_block_device *dev = bdev->bd_disk->private_data;
+
+	if (!dev->i2o_dev)
+		return -ENODEV;
+
+	mutex_lock(&i2o_block_mutex);
+	if (dev->power > 0x1f)
+		i2o_block_device_power(dev, 0x02);
+
+	i2o_block_device_mount(dev->i2o_dev, -1);
+
+	i2o_block_device_lock(dev->i2o_dev, -1);
+
+	osm_debug("Ready.\n");
+	mutex_unlock(&i2o_block_mutex);
+
+	return 0;
+};
+
+/**
+ *	i2o_block_release - Release the I2O block device
+ *	@disk: gendisk device being released
+ *	@mode: file open mode
+ *
+ *	Unlock and unmount the media, and power down the device. Gets called if
+ *	the block device is closed.
+ */
+static void i2o_block_release(struct gendisk *disk, fmode_t mode)
+{
+	struct i2o_block_device *dev = disk->private_data;
+	u8 operation;
+
+	/*
+	 * This is to deal with the case of an application
+	 * opening a device and then the device disappears while
+	 * it's in use, and then the application tries to release
+	 * it.  ex: Unmounting a deleted RAID volume at reboot.
+	 * If we send messages, it will just cause FAILs since
+	 * the TID no longer exists.
+	 */
+	if (!dev->i2o_dev)
+		return;
+
+	mutex_lock(&i2o_block_mutex);
+	i2o_block_device_flush(dev->i2o_dev);
+
+	i2o_block_device_unlock(dev->i2o_dev, -1);
+
+	if (dev->flags & (1 << 3 | 1 << 4))	/* Removable */
+		operation = 0x21;
+	else
+		operation = 0x24;
+
+	i2o_block_device_power(dev, operation);
+	mutex_unlock(&i2o_block_mutex);
+}
+
+static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	i2o_block_biosparam(get_capacity(bdev->bd_disk),
+			    &geo->cylinders, &geo->heads, &geo->sectors);
+	return 0;
+}
+
+/**
+ *	i2o_block_ioctl - Issue device specific ioctl calls.
+ *	@bdev: block device being opened
+ *	@mode: file open mode
+ *	@cmd: ioctl command
+ *	@arg: arg
+ *
+ *	Handles ioctl request for the block device.
+ *
+ *	Return 0 on success or negative error on failure.
+ */
+static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct i2o_block_device *dev = disk->private_data;
+	int ret = -ENOTTY;
+
+	/* Anyone capable of this syscall can do *real bad* things */
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&i2o_block_mutex);
+	switch (cmd) {
+	case BLKI2OGRSTRAT:
+		ret = put_user(dev->rcache, (int __user *)arg);
+		break;
+	case BLKI2OGWSTRAT:
+		ret = put_user(dev->wcache, (int __user *)arg);
+		break;
+	case BLKI2OSRSTRAT:
+		ret = -EINVAL;
+		if (arg < 0 || arg > CACHE_SMARTFETCH)
+			break;
+		dev->rcache = arg;
+		ret = 0;
+		break;
+	case BLKI2OSWSTRAT:
+		ret = -EINVAL;
+		if (arg != 0
+		    && (arg < CACHE_WRITETHROUGH || arg > CACHE_SMARTBACK))
+			break;
+		dev->wcache = arg;
+		ret = 0;
+		break;
+	}
+	mutex_unlock(&i2o_block_mutex);
+
+	return ret;
+};
+
+/**
+ *	i2o_block_check_events - Have we seen a media change?
+ *	@disk: gendisk which should be verified
+ *	@clearing: events being cleared
+ *
+ *	Verifies if the media has changed.
+ *
+ *	Returns 1 if the media was changed or 0 otherwise.
+ */
+static unsigned int i2o_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
+{
+	struct i2o_block_device *p = disk->private_data;
+
+	if (p->media_change_flag) {
+		p->media_change_flag = 0;
+		return DISK_EVENT_MEDIA_CHANGE;
+	}
+	return 0;
+}
+
+/**
+ *	i2o_block_transfer - Transfer a request to/from the I2O controller
+ *	@req: the request which should be transferred
+ *
+ *	This function converts the request into a I2O message. The necessary
+ *	DMA buffers are allocated and after everything is setup post the message
+ *	to the I2O controller. No cleanup is done by this function. It is done
+ *	on the interrupt side when the reply arrives.
+ *
+ *	Return 0 on success or negative error code on failure.
+ */
+static int i2o_block_transfer(struct request *req)
+{
+	struct i2o_block_device *dev = req->rq_disk->private_data;
+	struct i2o_controller *c;
+	u32 tid;
+	struct i2o_message *msg;
+	u32 *mptr;
+	struct i2o_block_request *ireq = req->special;
+	u32 tcntxt;
+	u32 sgl_offset = SGL_OFFSET_8;
+	u32 ctl_flags = 0x00000000;
+	int rc;
+	u32 cmd;
+
+	if (unlikely(!dev->i2o_dev)) {
+		osm_err("transfer to removed drive\n");
+		rc = -ENODEV;
+		goto exit;
+	}
+
+	tid = dev->i2o_dev->lct_data.tid;
+	c = dev->i2o_dev->iop;
+
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
+		rc = PTR_ERR(msg);
+		goto exit;
+	}
+
+	tcntxt = i2o_cntxt_list_add(c, req);
+	if (!tcntxt) {
+		rc = -ENOMEM;
+		goto nop_msg;
+	}
+
+	msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
+
+	mptr = &msg->body[0];
+
+	if (rq_data_dir(req) == READ) {
+		cmd = I2O_CMD_BLOCK_READ << 24;
+
+		switch (dev->rcache) {
+		case CACHE_PREFETCH:
+			ctl_flags = 0x201F0008;
+			break;
+
+		case CACHE_SMARTFETCH:
+			if (blk_rq_sectors(req) > 16)
+				ctl_flags = 0x201F0008;
+			else
+				ctl_flags = 0x001F0000;
+			break;
+
+		default:
+			break;
+		}
+	} else {
+		cmd = I2O_CMD_BLOCK_WRITE << 24;
+
+		switch (dev->wcache) {
+		case CACHE_WRITETHROUGH:
+			ctl_flags = 0x001F0008;
+			break;
+		case CACHE_WRITEBACK:
+			ctl_flags = 0x001F0010;
+			break;
+		case CACHE_SMARTBACK:
+			if (blk_rq_sectors(req) > 16)
+				ctl_flags = 0x001F0004;
+			else
+				ctl_flags = 0x001F0010;
+			break;
+		case CACHE_SMARTTHROUGH:
+			if (blk_rq_sectors(req) > 16)
+				ctl_flags = 0x001F0004;
+			else
+				ctl_flags = 0x001F0010;
+		default:
+			break;
+		}
+	}
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		u8 cmd[10];
+		u32 scsi_flags;
+		u16 hwsec;
+
+		hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT;
+		memset(cmd, 0, 10);
+
+		sgl_offset = SGL_OFFSET_12;
+
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid);
+
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(tid);
+
+		/*
+		 * ENABLE_DISCONNECT
+		 * SIMPLE_TAG
+		 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
+		 */
+		if (rq_data_dir(req) == READ) {
+			cmd[0] = READ_10;
+			scsi_flags = 0x60a0000a;
+		} else {
+			cmd[0] = WRITE_10;
+			scsi_flags = 0xa0a0000a;
+		}
+
+		*mptr++ = cpu_to_le32(scsi_flags);
+
+		*((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec);
+		*((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec);
+
+		memcpy(mptr, cmd, 10);
+		mptr += 4;
+		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
+	} else
+#endif
+	{
+		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+		*mptr++ = cpu_to_le32(ctl_flags);
+		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
+		*mptr++ =
+		    cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT));
+		*mptr++ =
+		    cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT));
+	}
+
+	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
+		rc = -ENOMEM;
+		goto context_remove;
+	}
+
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
+
+	list_add_tail(&ireq->queue, &dev->open_queue);
+	dev->open_queue_depth++;
+
+	i2o_msg_post(c, msg);
+
+	return 0;
+
+      context_remove:
+	i2o_cntxt_list_remove(c, req);
+
+      nop_msg:
+	i2o_msg_nop(c, msg);
+
+      exit:
+	return rc;
+};
+
+/**
+ *	i2o_block_request_fn - request queue handling function
+ *	@q: request queue from which the request could be fetched
+ *
+ *	Takes the next request from the queue, transfers it and if no error
+ *	occurs dequeue it from the queue. On arrival of the reply the message
+ *	will be processed further. If an error occurs requeue the request.
+ */
+static void i2o_block_request_fn(struct request_queue *q)
+{
+	struct request *req;
+
+	while ((req = blk_peek_request(q)) != NULL) {
+		if (req->cmd_type == REQ_TYPE_FS) {
+			struct i2o_block_delayed_request *dreq;
+			struct i2o_block_request *ireq = req->special;
+			unsigned int queue_depth;
+
+			queue_depth = ireq->i2o_blk_dev->open_queue_depth;
+
+			if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) {
+				if (!i2o_block_transfer(req)) {
+					blk_start_request(req);
+					continue;
+				} else
+					osm_info("transfer error\n");
+			}
+
+			if (queue_depth)
+				break;
+
+			/* stop the queue and retry later */
+			dreq = kmalloc(sizeof(*dreq), GFP_ATOMIC);
+			if (!dreq)
+				continue;
+
+			dreq->queue = q;
+			INIT_DELAYED_WORK(&dreq->work,
+					  i2o_block_delayed_request_fn);
+
+			if (!queue_delayed_work(i2o_block_driver.event_queue,
+						&dreq->work,
+						I2O_BLOCK_RETRY_TIME))
+				kfree(dreq);
+			else {
+				blk_stop_queue(q);
+				break;
+			}
+		} else {
+			blk_start_request(req);
+			__blk_end_request_all(req, -EIO);
+		}
+	}
+};
+
+/* I2O Block device operations definition */
+static const struct block_device_operations i2o_block_fops = {
+	.owner = THIS_MODULE,
+	.open = i2o_block_open,
+	.release = i2o_block_release,
+	.ioctl = i2o_block_ioctl,
+	.compat_ioctl = i2o_block_ioctl,
+	.getgeo = i2o_block_getgeo,
+	.check_events = i2o_block_check_events,
+};
+
+/**
+ *	i2o_block_device_alloc - Allocate memory for a I2O Block device
+ *
+ *	Allocate memory for the i2o_block_device struct, gendisk and request
+ *	queue and initialize them as far as no additional information is needed.
+ *
+ *	Returns a pointer to the allocated I2O Block device on success or a
+ *	negative error code on failure.
+ */
+static struct i2o_block_device *i2o_block_device_alloc(void)
+{
+	struct i2o_block_device *dev;
+	struct gendisk *gd;
+	struct request_queue *queue;
+	int rc;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		osm_err("Insufficient memory to allocate I2O Block disk.\n");
+		rc = -ENOMEM;
+		goto exit;
+	}
+
+	INIT_LIST_HEAD(&dev->open_queue);
+	spin_lock_init(&dev->lock);
+	dev->rcache = CACHE_PREFETCH;
+	dev->wcache = CACHE_WRITEBACK;
+
+	/* allocate a gendisk with 16 partitions */
+	gd = alloc_disk(16);
+	if (!gd) {
+		osm_err("Insufficient memory to allocate gendisk.\n");
+		rc = -ENOMEM;
+		goto cleanup_dev;
+	}
+
+	/* initialize the request queue */
+	queue = blk_init_queue(i2o_block_request_fn, &dev->lock);
+	if (!queue) {
+		osm_err("Insufficient memory to allocate request queue.\n");
+		rc = -ENOMEM;
+		goto cleanup_queue;
+	}
+
+	blk_queue_prep_rq(queue, i2o_block_prep_req_fn);
+
+	gd->major = I2O_MAJOR;
+	gd->queue = queue;
+	gd->fops = &i2o_block_fops;
+	gd->private_data = dev;
+
+	dev->gd = gd;
+
+	return dev;
+
+      cleanup_queue:
+	put_disk(gd);
+
+      cleanup_dev:
+	kfree(dev);
+
+      exit:
+	return ERR_PTR(rc);
+};
+
+/**
+ *	i2o_block_probe - verify if dev is a I2O Block device and install it
+ *	@dev: device to verify if it is a I2O Block device
+ *
+ *	We only verify if the user_tid of the device is 0xfff and then install
+ *	the device. Otherwise it is used by some other device (e. g. RAID).
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_probe(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	struct i2o_controller *c = i2o_dev->iop;
+	struct i2o_block_device *i2o_blk_dev;
+	struct gendisk *gd;
+	struct request_queue *queue;
+	static int unit = 0;
+	int rc;
+	u64 size;
+	u32 blocksize;
+	u16 body_size = 4;
+	u16 power;
+	unsigned short max_sectors;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec)
+		body_size = 8;
+#endif
+
+	if (c->limit_sectors)
+		max_sectors = I2O_MAX_SECTORS_LIMITED;
+	else
+		max_sectors = I2O_MAX_SECTORS;
+
+	/* skip devices which are used by IOP */
+	if (i2o_dev->lct_data.user_tid != 0xfff) {
+		osm_debug("skipping used device %03x\n", i2o_dev->lct_data.tid);
+		return -ENODEV;
+	}
+
+	if (i2o_device_claim(i2o_dev)) {
+		osm_warn("Unable to claim device. Installation aborted\n");
+		rc = -EFAULT;
+		goto exit;
+	}
+
+	i2o_blk_dev = i2o_block_device_alloc();
+	if (IS_ERR(i2o_blk_dev)) {
+		osm_err("could not alloc a new I2O block device");
+		rc = PTR_ERR(i2o_blk_dev);
+		goto claim_release;
+	}
+
+	i2o_blk_dev->i2o_dev = i2o_dev;
+	dev_set_drvdata(dev, i2o_blk_dev);
+
+	/* setup gendisk */
+	gd = i2o_blk_dev->gd;
+	gd->first_minor = unit << 4;
+	sprintf(gd->disk_name, "i2o/hd%c", 'a' + unit);
+	gd->driverfs_dev = &i2o_dev->device;
+
+	/* setup request queue */
+	queue = gd->queue;
+	queue->queuedata = i2o_blk_dev;
+
+	blk_queue_max_hw_sectors(queue, max_sectors);
+	blk_queue_max_segments(queue, i2o_sg_tablesize(c, body_size));
+
+	osm_debug("max sectors = %d\n", queue->max_sectors);
+	osm_debug("phys segments = %d\n", queue->max_phys_segments);
+	osm_debug("max hw segments = %d\n", queue->max_hw_segments);
+
+	/*
+	 *      Ask for the current media data. If that isn't supported
+	 *      then we ask for the device capacity data
+	 */
+	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
+	    !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
+		blk_queue_logical_block_size(queue, le32_to_cpu(blocksize));
+	} else
+		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
+
+	if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
+	    !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
+		set_capacity(gd, le64_to_cpu(size) >> KERNEL_SECTOR_SHIFT);
+	} else
+		osm_warn("could not get size of %s\n", gd->disk_name);
+
+	if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2))
+		i2o_blk_dev->power = power;
+
+	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff);
+
+	add_disk(gd);
+
+	unit++;
+
+	osm_info("device added (TID: %03x): %s\n", i2o_dev->lct_data.tid,
+		 i2o_blk_dev->gd->disk_name);
+
+	return 0;
+
+      claim_release:
+	i2o_device_claim_release(i2o_dev);
+
+      exit:
+	return rc;
+};
+
+/* Block OSM driver struct */
+static struct i2o_driver i2o_block_driver = {
+	.name = OSM_NAME,
+	.event = i2o_block_event,
+	.reply = i2o_block_reply,
+	.classes = i2o_block_class_id,
+	.driver = {
+		   .probe = i2o_block_probe,
+		   .remove = i2o_block_remove,
+		   },
+};
+
+/**
+ *	i2o_block_init - Block OSM initialization function
+ *
+ *	Allocate the slab and mempool for request structs, registers i2o_block
+ *	block device and finally register the Block OSM in the I2O core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_block_init(void)
+{
+	int rc;
+	int size;
+
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	/* Allocate request mempool and slab */
+	size = sizeof(struct i2o_block_request);
+	i2o_blk_req_pool.slab = kmem_cache_create("i2o_block_req", size, 0,
+						  SLAB_HWCACHE_ALIGN, NULL);
+	if (!i2o_blk_req_pool.slab) {
+		osm_err("can't init request slab\n");
+		rc = -ENOMEM;
+		goto exit;
+	}
+
+	i2o_blk_req_pool.pool =
+		mempool_create_slab_pool(I2O_BLOCK_REQ_MEMPOOL_SIZE,
+					 i2o_blk_req_pool.slab);
+	if (!i2o_blk_req_pool.pool) {
+		osm_err("can't init request mempool\n");
+		rc = -ENOMEM;
+		goto free_slab;
+	}
+
+	/* Register the block device interfaces */
+	rc = register_blkdev(I2O_MAJOR, "i2o_block");
+	if (rc) {
+		osm_err("unable to register block device\n");
+		goto free_mempool;
+	}
+#ifdef MODULE
+	osm_info("registered device at major %d\n", I2O_MAJOR);
+#endif
+
+	/* Register Block OSM into I2O core */
+	rc = i2o_driver_register(&i2o_block_driver);
+	if (rc) {
+		osm_err("Could not register Block driver\n");
+		goto unregister_blkdev;
+	}
+
+	return 0;
+
+      unregister_blkdev:
+	unregister_blkdev(I2O_MAJOR, "i2o_block");
+
+      free_mempool:
+	mempool_destroy(i2o_blk_req_pool.pool);
+
+      free_slab:
+	kmem_cache_destroy(i2o_blk_req_pool.slab);
+
+      exit:
+	return rc;
+};
+
+/**
+ *	i2o_block_exit - Block OSM exit function
+ *
+ *	Unregisters Block OSM from I2O core, unregisters i2o_block block device
+ *	and frees the mempool and slab.
+ */
+static void __exit i2o_block_exit(void)
+{
+	/* Unregister I2O Block OSM from I2O core */
+	i2o_driver_unregister(&i2o_block_driver);
+
+	/* Unregister block device */
+	unregister_blkdev(I2O_MAJOR, "i2o_block");
+
+	/* Free request mempool and slab */
+	mempool_destroy(i2o_blk_req_pool.pool);
+	kmem_cache_destroy(i2o_blk_req_pool.slab);
+};
+
+MODULE_AUTHOR("Red Hat");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_block_init);
+module_exit(i2o_block_exit);
diff --git a/drivers/staging/i2o/i2o_block.h b/drivers/staging/i2o/i2o_block.h
new file mode 100644
index 000000000000..cf8873cbca3f
--- /dev/null
+++ b/drivers/staging/i2o/i2o_block.h
@@ -0,0 +1,103 @@
+/*
+ *	Block OSM structures/API
+ *
+ * 	Copyright (C) 1999-2002	Red Hat Software
+ *
+ *	Written by Alan Cox, Building Number Three Ltd
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	This program is distributed in the hope that it will be useful, but
+ *	WITHOUT ANY WARRANTY; without even the implied warranty of
+ *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *	General Public License for more details.
+ *
+ *	For the purpose of avoiding doubt the preferred form of the work
+ *	for making modifications shall be a standards compliant form such
+ *	gzipped tar and not one requiring a proprietary or patent encumbered
+ *	tool to unpack.
+ *
+ *	Fixes/additions:
+ *		Steve Ralston:
+ *			Multiple device handling error fixes,
+ *			Added a queue depth.
+ *		Alan Cox:
+ *			FC920 has an rmw bug. Dont or in the end marker.
+ *			Removed queue walk, fixed for 64bitness.
+ *			Rewrote much of the code over time
+ *			Added indirect block lists
+ *			Handle 64K limits on many controllers
+ *			Don't use indirects on the Promise (breaks)
+ *			Heavily chop down the queue depths
+ *		Deepak Saxena:
+ *			Independent queues per IOP
+ *			Support for dynamic device creation/deletion
+ *			Code cleanup
+ *	    		Support for larger I/Os through merge* functions
+ *			(taken from DAC960 driver)
+ *		Boji T Kannanthanam:
+ *			Set the I2O Block devices to be detected in increasing
+ *			order of TIDs during boot.
+ *			Search and set the I2O block device that we boot off
+ *			from as the first device to be claimed (as /dev/i2o/hda)
+ *			Properly attach/detach I2O gendisk structure from the
+ *			system gendisk list. The I2O block devices now appear in
+ *			/proc/partitions.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Minor bugfixes for 2.6.
+ */
+
+#ifndef I2O_BLOCK_OSM_H
+#define I2O_BLOCK_OSM_H
+
+#define I2O_BLOCK_RETRY_TIME HZ/4
+#define I2O_BLOCK_MAX_OPEN_REQUESTS 50
+
+/* request queue sizes */
+#define I2O_BLOCK_REQ_MEMPOOL_SIZE		32
+
+#define KERNEL_SECTOR_SHIFT 9
+#define KERNEL_SECTOR_SIZE (1 << KERNEL_SECTOR_SHIFT)
+
+/* I2O Block OSM mempool struct */
+struct i2o_block_mempool {
+	struct kmem_cache *slab;
+	mempool_t *pool;
+};
+
+/* I2O Block device descriptor */
+struct i2o_block_device {
+	struct i2o_device *i2o_dev;	/* pointer to I2O device */
+	struct gendisk *gd;
+	spinlock_t lock;	/* queue lock */
+	struct list_head open_queue;	/* list of transferred, but unfinished
+					   requests */
+	unsigned int open_queue_depth;	/* number of requests in the queue */
+
+	int rcache;		/* read cache flags */
+	int wcache;		/* write cache flags */
+	int flags;
+	u16 power;		/* power state */
+	int media_change_flag;	/* media changed flag */
+};
+
+/* I2O Block device request */
+struct i2o_block_request {
+	struct list_head queue;
+	struct request *req;	/* corresponding request */
+	struct i2o_block_device *i2o_blk_dev;	/* I2O block device */
+	struct device *dev;	/* device used for DMA */
+	int sg_nents;		/* number of SG elements */
+	struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS];	/* SG table */
+};
+
+/* I2O Block device delayed request */
+struct i2o_block_delayed_request {
+	struct delayed_work work;
+	struct request_queue *queue;
+};
+
+#endif
diff --git a/drivers/staging/i2o/i2o_config.c b/drivers/staging/i2o/i2o_config.c
new file mode 100644
index 000000000000..04bd3b6de401
--- /dev/null
+++ b/drivers/staging/i2o/i2o_config.c
@@ -0,0 +1,1163 @@
+/*
+ * I2O Configuration Interface Driver
+ *
+ * (C) Copyright 1999-2002  Red Hat
+ *
+ * Written by Alan Cox, Building Number Three Ltd
+ *
+ * Fixes/additions:
+ *	Deepak Saxena (04/20/1999):
+ *		Added basic ioctl() support
+ *	Deepak Saxena (06/07/1999):
+ *		Added software download ioctl (still testing)
+ *	Auvo Häkkinen (09/10/1999):
+ *		Changes to i2o_cfg_reply(), ioctl_parms()
+ *		Added ioct_validate()
+ *	Taneli Vähäkangas (09/30/1999):
+ *		Fixed ioctl_swdl()
+ *	Taneli Vähäkangas (10/04/1999):
+ *		Changed ioctl_swdl(), implemented ioctl_swul() and ioctl_swdel()
+ *	Deepak Saxena (11/18/1999):
+ *		Added event managmenet support
+ *	Alan Cox <alan@lxorguk.ukuu.org.uk>:
+ *		2.4 rewrite ported to 2.5
+ *	Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *		Added pass-thru support for Adaptec's raidutils
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/compat.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include "core.h"
+
+#define SG_TABLESIZE		30
+
+static DEFINE_MUTEX(i2o_cfg_mutex);
+static long i2o_cfg_ioctl(struct file *, unsigned int, unsigned long);
+
+static spinlock_t i2o_config_lock;
+
+#define MODINC(x,y) ((x) = ((x) + 1) % (y))
+
+struct sg_simple_element {
+	u32 flag_count;
+	u32 addr_bus;
+};
+
+struct i2o_cfg_info {
+	struct file *fp;
+	struct fasync_struct *fasync;
+	struct i2o_evt_info event_q[I2O_EVT_Q_LEN];
+	u16 q_in;		// Queue head index
+	u16 q_out;		// Queue tail index
+	u16 q_len;		// Queue length
+	u16 q_lost;		// Number of lost events
+	ulong q_id;		// Event queue ID...used as tx_context
+	struct i2o_cfg_info *next;
+};
+static struct i2o_cfg_info *open_files = NULL;
+static ulong i2o_cfg_info_id = 0;
+
+static int i2o_cfg_getiops(unsigned long arg)
+{
+	struct i2o_controller *c;
+	u8 __user *user_iop_table = (void __user *)arg;
+	u8 tmp[MAX_I2O_CONTROLLERS];
+	int ret = 0;
+
+	memset(tmp, 0, MAX_I2O_CONTROLLERS);
+
+	list_for_each_entry(c, &i2o_controllers, list)
+	    tmp[c->unit] = 1;
+
+	if (copy_to_user(user_iop_table, tmp, MAX_I2O_CONTROLLERS))
+		ret = -EFAULT;
+
+	return ret;
+};
+
+static int i2o_cfg_gethrt(unsigned long arg)
+{
+	struct i2o_controller *c;
+	struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg;
+	struct i2o_cmd_hrtlct kcmd;
+	i2o_hrt *hrt;
+	int len;
+	u32 reslen;
+	int ret = 0;
+
+	if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct)))
+		return -EFAULT;
+
+	if (get_user(reslen, kcmd.reslen) < 0)
+		return -EFAULT;
+
+	if (kcmd.resbuf == NULL)
+		return -EFAULT;
+
+	c = i2o_find_iop(kcmd.iop);
+	if (!c)
+		return -ENXIO;
+
+	hrt = (i2o_hrt *) c->hrt.virt;
+
+	len = 8 + ((hrt->entry_len * hrt->num_entries) << 2);
+
+	if (put_user(len, kcmd.reslen))
+		ret = -EFAULT;
+	else if (len > reslen)
+		ret = -ENOBUFS;
+	else if (copy_to_user(kcmd.resbuf, (void *)hrt, len))
+		ret = -EFAULT;
+
+	return ret;
+};
+
+static int i2o_cfg_getlct(unsigned long arg)
+{
+	struct i2o_controller *c;
+	struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg;
+	struct i2o_cmd_hrtlct kcmd;
+	i2o_lct *lct;
+	int len;
+	int ret = 0;
+	u32 reslen;
+
+	if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct)))
+		return -EFAULT;
+
+	if (get_user(reslen, kcmd.reslen) < 0)
+		return -EFAULT;
+
+	if (kcmd.resbuf == NULL)
+		return -EFAULT;
+
+	c = i2o_find_iop(kcmd.iop);
+	if (!c)
+		return -ENXIO;
+
+	lct = (i2o_lct *) c->lct;
+
+	len = (unsigned int)lct->table_size << 2;
+	if (put_user(len, kcmd.reslen))
+		ret = -EFAULT;
+	else if (len > reslen)
+		ret = -ENOBUFS;
+	else if (copy_to_user(kcmd.resbuf, lct, len))
+		ret = -EFAULT;
+
+	return ret;
+};
+
+static int i2o_cfg_parms(unsigned long arg, unsigned int type)
+{
+	int ret = 0;
+	struct i2o_controller *c;
+	struct i2o_device *dev;
+	struct i2o_cmd_psetget __user *cmd =
+	    (struct i2o_cmd_psetget __user *)arg;
+	struct i2o_cmd_psetget kcmd;
+	u32 reslen;
+	u8 *ops;
+	u8 *res;
+	int len = 0;
+
+	u32 i2o_cmd = (type == I2OPARMGET ?
+		       I2O_CMD_UTIL_PARAMS_GET : I2O_CMD_UTIL_PARAMS_SET);
+
+	if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_psetget)))
+		return -EFAULT;
+
+	if (get_user(reslen, kcmd.reslen))
+		return -EFAULT;
+
+	c = i2o_find_iop(kcmd.iop);
+	if (!c)
+		return -ENXIO;
+
+	dev = i2o_iop_find_device(c, kcmd.tid);
+	if (!dev)
+		return -ENXIO;
+
+	/*
+	 * Stop users being able to try and allocate arbitrary amounts
+	 * of DMA space. 64K is way more than sufficient for this.
+	 */
+	if (kcmd.oplen > 65536)
+		return -EMSGSIZE;
+
+	ops = memdup_user(kcmd.opbuf, kcmd.oplen);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	/*
+	 * It's possible to have a _very_ large table
+	 * and that the user asks for all of it at once...
+	 */
+	res = kmalloc(65536, GFP_KERNEL);
+	if (!res) {
+		kfree(ops);
+		return -ENOMEM;
+	}
+
+	len = i2o_parm_issue(dev, i2o_cmd, ops, kcmd.oplen, res, 65536);
+	kfree(ops);
+
+	if (len < 0) {
+		kfree(res);
+		return -EAGAIN;
+	}
+
+	if (put_user(len, kcmd.reslen))
+		ret = -EFAULT;
+	else if (len > reslen)
+		ret = -ENOBUFS;
+	else if (copy_to_user(kcmd.resbuf, res, len))
+		ret = -EFAULT;
+
+	kfree(res);
+
+	return ret;
+};
+
+static int i2o_cfg_swdl(unsigned long arg)
+{
+	struct i2o_sw_xfer kxfer;
+	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
+	unsigned char maxfrag = 0, curfrag = 1;
+	struct i2o_dma buffer;
+	struct i2o_message *msg;
+	unsigned int status = 0, swlen = 0, fragsize = 8192;
+	struct i2o_controller *c;
+
+	if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer)))
+		return -EFAULT;
+
+	if (get_user(swlen, kxfer.swlen) < 0)
+		return -EFAULT;
+
+	if (get_user(maxfrag, kxfer.maxfrag) < 0)
+		return -EFAULT;
+
+	if (get_user(curfrag, kxfer.curfrag) < 0)
+		return -EFAULT;
+
+	if (curfrag == maxfrag)
+		fragsize = swlen - (maxfrag - 1) * 8192;
+
+	if (!kxfer.buf || !access_ok(VERIFY_READ, kxfer.buf, fragsize))
+		return -EFAULT;
+
+	c = i2o_find_iop(kxfer.iop);
+	if (!c)
+		return -ENXIO;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) {
+		i2o_msg_nop(c, msg);
+		return -ENOMEM;
+	}
+
+	if (__copy_from_user(buffer.virt, kxfer.buf, fragsize)) {
+		i2o_msg_nop(c, msg);
+		i2o_dma_free(&c->pdev->dev, &buffer);
+		return -EFAULT;
+	}
+
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer.
+							sw_type) << 16) |
+			(((u32) maxfrag) << 8) | (((u32) curfrag)));
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
+	msg->body[4] = cpu_to_le32(buffer.phys);
+
+	osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
+	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
+
+	if (status != -ETIMEDOUT)
+		i2o_dma_free(&c->pdev->dev, &buffer);
+
+	if (status != I2O_POST_WAIT_OK) {
+		// it fails if you try and send frags out of order
+		// and for some yet unknown reasons too
+		osm_info("swdl failed, DetailedStatus = %d\n", status);
+		return status;
+	}
+
+	return 0;
+};
+
+static int i2o_cfg_swul(unsigned long arg)
+{
+	struct i2o_sw_xfer kxfer;
+	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
+	unsigned char maxfrag = 0, curfrag = 1;
+	struct i2o_dma buffer;
+	struct i2o_message *msg;
+	unsigned int status = 0, swlen = 0, fragsize = 8192;
+	struct i2o_controller *c;
+	int ret = 0;
+
+	if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer)))
+		return -EFAULT;
+
+	if (get_user(swlen, kxfer.swlen) < 0)
+		return -EFAULT;
+
+	if (get_user(maxfrag, kxfer.maxfrag) < 0)
+		return -EFAULT;
+
+	if (get_user(curfrag, kxfer.curfrag) < 0)
+		return -EFAULT;
+
+	if (curfrag == maxfrag)
+		fragsize = swlen - (maxfrag - 1) * 8192;
+
+	if (!kxfer.buf)
+		return -EFAULT;
+
+	c = i2o_find_iop(kxfer.iop);
+	if (!c)
+		return -ENXIO;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) {
+		i2o_msg_nop(c, msg);
+		return -ENOMEM;
+	}
+
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.
+			sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag);
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
+	msg->body[4] = cpu_to_le32(buffer.phys);
+
+	osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
+	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
+
+	if (status != I2O_POST_WAIT_OK) {
+		if (status != -ETIMEDOUT)
+			i2o_dma_free(&c->pdev->dev, &buffer);
+
+		osm_info("swul failed, DetailedStatus = %d\n", status);
+		return status;
+	}
+
+	if (copy_to_user(kxfer.buf, buffer.virt, fragsize))
+		ret = -EFAULT;
+
+	i2o_dma_free(&c->pdev->dev, &buffer);
+
+	return ret;
+}
+
+static int i2o_cfg_swdel(unsigned long arg)
+{
+	struct i2o_controller *c;
+	struct i2o_sw_xfer kxfer;
+	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
+	struct i2o_message *msg;
+	unsigned int swlen;
+	int token;
+
+	if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer)))
+		return -EFAULT;
+
+	if (get_user(swlen, kxfer.swlen) < 0)
+		return -EFAULT;
+
+	c = i2o_find_iop(kxfer.iop);
+	if (!c)
+		return -ENXIO;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16);
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+
+	token = i2o_msg_post_wait(c, msg, 10);
+
+	if (token != I2O_POST_WAIT_OK) {
+		osm_info("swdel failed, DetailedStatus = %d\n", token);
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+};
+
+static int i2o_cfg_validate(unsigned long arg)
+{
+	int token;
+	int iop = (int)arg;
+	struct i2o_message *msg;
+	struct i2o_controller *c;
+
+	c = i2o_find_iop(iop);
+	if (!c)
+		return -ENXIO;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+
+	token = i2o_msg_post_wait(c, msg, 10);
+
+	if (token != I2O_POST_WAIT_OK) {
+		osm_info("Can't validate configuration, ErrorStatus = %d\n",
+			 token);
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+};
+
+static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
+{
+	struct i2o_message *msg;
+	struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg;
+	struct i2o_evt_id kdesc;
+	struct i2o_controller *c;
+	struct i2o_device *d;
+
+	if (copy_from_user(&kdesc, pdesc, sizeof(struct i2o_evt_id)))
+		return -EFAULT;
+
+	/* IOP exists? */
+	c = i2o_find_iop(kdesc.iop);
+	if (!c)
+		return -ENXIO;
+
+	/* Device exists? */
+	d = i2o_iop_find_device(c, kdesc.tid);
+	if (!d)
+		return -ENODEV;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 |
+			kdesc.tid);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data));
+	msg->body[0] = cpu_to_le32(kdesc.evt_mask);
+
+	i2o_msg_post(c, msg);
+
+	return 0;
+}
+
+static int i2o_cfg_evt_get(unsigned long arg, struct file *fp)
+{
+	struct i2o_cfg_info *p = NULL;
+	struct i2o_evt_get __user *uget = (struct i2o_evt_get __user *)arg;
+	struct i2o_evt_get kget;
+	unsigned long flags;
+
+	for (p = open_files; p; p = p->next)
+		if (p->q_id == (ulong) fp->private_data)
+			break;
+
+	if (!p->q_len)
+		return -ENOENT;
+
+	memcpy(&kget.info, &p->event_q[p->q_out], sizeof(struct i2o_evt_info));
+	MODINC(p->q_out, I2O_EVT_Q_LEN);
+	spin_lock_irqsave(&i2o_config_lock, flags);
+	p->q_len--;
+	kget.pending = p->q_len;
+	kget.lost = p->q_lost;
+	spin_unlock_irqrestore(&i2o_config_lock, flags);
+
+	if (copy_to_user(uget, &kget, sizeof(struct i2o_evt_get)))
+		return -EFAULT;
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
+			      unsigned long arg)
+{
+	struct i2o_cmd_passthru32 __user *cmd;
+	struct i2o_controller *c;
+	u32 __user *user_msg;
+	u32 *reply = NULL;
+	u32 __user *user_reply = NULL;
+	u32 size = 0;
+	u32 reply_size = 0;
+	u32 rcode = 0;
+	struct i2o_dma sg_list[SG_TABLESIZE];
+	u32 sg_offset = 0;
+	u32 sg_count = 0;
+	u32 i = 0;
+	u32 sg_index = 0;
+	i2o_status_block *sb;
+	struct i2o_message *msg;
+	unsigned int iop;
+
+	cmd = (struct i2o_cmd_passthru32 __user *)arg;
+
+	if (get_user(iop, &cmd->iop) || get_user(i, &cmd->msg))
+		return -EFAULT;
+
+	user_msg = compat_ptr(i);
+
+	c = i2o_find_iop(iop);
+	if (!c) {
+		osm_debug("controller %d not found\n", iop);
+		return -ENXIO;
+	}
+
+	sb = c->status_block.virt;
+
+	if (get_user(size, &user_msg[0])) {
+		osm_warn("unable to get size!\n");
+		return -EFAULT;
+	}
+	size = size >> 16;
+
+	if (size > sb->inbound_frame_size) {
+		osm_warn("size of message > inbound_frame_size");
+		return -EFAULT;
+	}
+
+	user_reply = &user_msg[size];
+
+	size <<= 2;		// Convert to bytes
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	rcode = -EFAULT;
+	/* Copy in the user's I2O command */
+	if (copy_from_user(msg, user_msg, size)) {
+		osm_warn("unable to copy user message\n");
+		goto out;
+	}
+	i2o_dump_message(msg);
+
+	if (get_user(reply_size, &user_reply[0]) < 0)
+		goto out;
+
+	reply_size >>= 16;
+	reply_size <<= 2;
+
+	rcode = -ENOMEM;
+	reply = kzalloc(reply_size, GFP_KERNEL);
+	if (!reply) {
+		printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
+		       c->name);
+		goto out;
+	}
+
+	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
+
+	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
+	if (sg_offset) {
+		struct sg_simple_element *sg;
+
+		if (sg_offset * 4 >= size) {
+			rcode = -EFAULT;
+			goto cleanup;
+		}
+		// TODO 64bit fix
+		sg = (struct sg_simple_element *)((&msg->u.head[0]) +
+						  sg_offset);
+		sg_count =
+		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
+		if (sg_count > SG_TABLESIZE) {
+			printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n",
+			       c->name, sg_count);
+			rcode = -EINVAL;
+			goto cleanup;
+		}
+
+		for (i = 0; i < sg_count; i++) {
+			int sg_size;
+			struct i2o_dma *p;
+
+			if (!(sg[i].flag_count & 0x10000000
+			      /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) {
+				printk(KERN_DEBUG
+				       "%s:Bad SG element %d - not simple (%x)\n",
+				       c->name, i, sg[i].flag_count);
+				rcode = -EINVAL;
+				goto cleanup;
+			}
+			sg_size = sg[i].flag_count & 0xffffff;
+			p = &(sg_list[sg_index]);
+			/* Allocate memory for the transfer */
+			if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) {
+				printk(KERN_DEBUG
+				       "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n",
+				       c->name, sg_size, i, sg_count);
+				rcode = -ENOMEM;
+				goto sg_list_cleanup;
+			}
+			sg_index++;
+			/* Copy in the user's SG buffer if necessary */
+			if (sg[i].
+			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
+				// TODO 64bit fix
+				if (copy_from_user
+				    (p->virt,
+				     (void __user *)(unsigned long)sg[i].
+				     addr_bus, sg_size)) {
+					printk(KERN_DEBUG
+					       "%s: Could not copy SG buf %d FROM user\n",
+					       c->name, i);
+					rcode = -EFAULT;
+					goto sg_list_cleanup;
+				}
+			}
+			//TODO 64bit fix
+			sg[i].addr_bus = (u32) p->phys;
+		}
+	}
+
+	rcode = i2o_msg_post_wait(c, msg, 60);
+	msg = NULL;
+	if (rcode) {
+		reply[4] = ((u32) rcode) << 24;
+		goto sg_list_cleanup;
+	}
+
+	if (sg_offset) {
+		u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE];
+		/* Copy back the Scatter Gather buffers back to user space */
+		u32 j;
+		// TODO 64bit fix
+		struct sg_simple_element *sg;
+		int sg_size;
+
+		// re-acquire the original message to handle correctly the sg copy operation
+		memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4);
+		// get user msg size in u32s
+		if (get_user(size, &user_msg[0])) {
+			rcode = -EFAULT;
+			goto sg_list_cleanup;
+		}
+		size = size >> 16;
+		size *= 4;
+		if (size > sizeof(rmsg)) {
+			rcode = -EINVAL;
+			goto sg_list_cleanup;
+		}
+
+		/* Copy in the user's I2O command */
+		if (copy_from_user(rmsg, user_msg, size)) {
+			rcode = -EFAULT;
+			goto sg_list_cleanup;
+		}
+		sg_count =
+		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
+
+		// TODO 64bit fix
+		sg = (struct sg_simple_element *)(rmsg + sg_offset);
+		for (j = 0; j < sg_count; j++) {
+			/* Copy out the SG list to user's buffer if necessary */
+			if (!
+			    (sg[j].
+			     flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) {
+				sg_size = sg[j].flag_count & 0xffffff;
+				// TODO 64bit fix
+				if (copy_to_user
+				    ((void __user *)(u64) sg[j].addr_bus,
+				     sg_list[j].virt, sg_size)) {
+					printk(KERN_WARNING
+					       "%s: Could not copy %p TO user %x\n",
+					       c->name, sg_list[j].virt,
+					       sg[j].addr_bus);
+					rcode = -EFAULT;
+					goto sg_list_cleanup;
+				}
+			}
+		}
+	}
+
+sg_list_cleanup:
+	/* Copy back the reply to user space */
+	if (reply_size) {
+		// we wrote our own values for context - now restore the user supplied ones
+		if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) {
+			printk(KERN_WARNING
+			       "%s: Could not copy message context FROM user\n",
+			       c->name);
+			rcode = -EFAULT;
+		}
+		if (copy_to_user(user_reply, reply, reply_size)) {
+			printk(KERN_WARNING
+			       "%s: Could not copy reply TO user\n", c->name);
+			rcode = -EFAULT;
+		}
+	}
+	for (i = 0; i < sg_index; i++)
+		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
+
+cleanup:
+	kfree(reply);
+out:
+	if (msg)
+		i2o_msg_nop(c, msg);
+	return rcode;
+}
+
+static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd,
+				 unsigned long arg)
+{
+	int ret;
+	switch (cmd) {
+	case I2OGETIOPS:
+		ret = i2o_cfg_ioctl(file, cmd, arg);
+		break;
+	case I2OPASSTHRU32:
+		mutex_lock(&i2o_cfg_mutex);
+		ret = i2o_cfg_passthru32(file, cmd, arg);
+		mutex_unlock(&i2o_cfg_mutex);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+	return ret;
+}
+
+#endif
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+static int i2o_cfg_passthru(unsigned long arg)
+{
+	struct i2o_cmd_passthru __user *cmd =
+	    (struct i2o_cmd_passthru __user *)arg;
+	struct i2o_controller *c;
+	u32 __user *user_msg;
+	u32 *reply = NULL;
+	u32 __user *user_reply = NULL;
+	u32 size = 0;
+	u32 reply_size = 0;
+	u32 rcode = 0;
+	struct i2o_dma sg_list[SG_TABLESIZE];
+	u32 sg_offset = 0;
+	u32 sg_count = 0;
+	int sg_index = 0;
+	u32 i = 0;
+	i2o_status_block *sb;
+	struct i2o_message *msg;
+	unsigned int iop;
+
+	if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg))
+		return -EFAULT;
+
+	c = i2o_find_iop(iop);
+	if (!c) {
+		osm_warn("controller %d not found\n", iop);
+		return -ENXIO;
+	}
+
+	sb = c->status_block.virt;
+
+	if (get_user(size, &user_msg[0]))
+		return -EFAULT;
+	size = size >> 16;
+
+	if (size > sb->inbound_frame_size) {
+		osm_warn("size of message > inbound_frame_size");
+		return -EFAULT;
+	}
+
+	user_reply = &user_msg[size];
+
+	size <<= 2;		// Convert to bytes
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	rcode = -EFAULT;
+	/* Copy in the user's I2O command */
+	if (copy_from_user(msg, user_msg, size))
+		goto out;
+
+	if (get_user(reply_size, &user_reply[0]) < 0)
+		goto out;
+
+	reply_size >>= 16;
+	reply_size <<= 2;
+
+	reply = kzalloc(reply_size, GFP_KERNEL);
+	if (!reply) {
+		printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
+		       c->name);
+		rcode = -ENOMEM;
+		goto out;
+	}
+
+	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
+
+	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
+	if (sg_offset) {
+		struct sg_simple_element *sg;
+		struct i2o_dma *p;
+
+		if (sg_offset * 4 >= size) {
+			rcode = -EFAULT;
+			goto cleanup;
+		}
+		// TODO 64bit fix
+		sg = (struct sg_simple_element *)((&msg->u.head[0]) +
+						  sg_offset);
+		sg_count =
+		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
+		if (sg_count > SG_TABLESIZE) {
+			printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n",
+			       c->name, sg_count);
+			rcode = -EINVAL;
+			goto cleanup;
+		}
+
+		for (i = 0; i < sg_count; i++) {
+			int sg_size;
+
+			if (!(sg[i].flag_count & 0x10000000
+			      /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) {
+				printk(KERN_DEBUG
+				       "%s:Bad SG element %d - not simple (%x)\n",
+				       c->name, i, sg[i].flag_count);
+				rcode = -EINVAL;
+				goto sg_list_cleanup;
+			}
+			sg_size = sg[i].flag_count & 0xffffff;
+			p = &(sg_list[sg_index]);
+			if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) {
+			/* Allocate memory for the transfer */
+				printk(KERN_DEBUG
+				       "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n",
+				       c->name, sg_size, i, sg_count);
+				rcode = -ENOMEM;
+				goto sg_list_cleanup;
+			}
+			sg_index++;
+			/* Copy in the user's SG buffer if necessary */
+			if (sg[i].
+			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
+				// TODO 64bit fix
+				if (copy_from_user
+				    (p->virt, (void __user *)sg[i].addr_bus,
+				     sg_size)) {
+					printk(KERN_DEBUG
+					       "%s: Could not copy SG buf %d FROM user\n",
+					       c->name, i);
+					rcode = -EFAULT;
+					goto sg_list_cleanup;
+				}
+			}
+			sg[i].addr_bus = p->phys;
+		}
+	}
+
+	rcode = i2o_msg_post_wait(c, msg, 60);
+	msg = NULL;
+	if (rcode) {
+		reply[4] = ((u32) rcode) << 24;
+		goto sg_list_cleanup;
+	}
+
+	if (sg_offset) {
+		u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE];
+		/* Copy back the Scatter Gather buffers back to user space */
+		u32 j;
+		// TODO 64bit fix
+		struct sg_simple_element *sg;
+		int sg_size;
+
+		// re-acquire the original message to handle correctly the sg copy operation
+		memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4);
+		// get user msg size in u32s
+		if (get_user(size, &user_msg[0])) {
+			rcode = -EFAULT;
+			goto sg_list_cleanup;
+		}
+		size = size >> 16;
+		size *= 4;
+		if (size > sizeof(rmsg)) {
+			rcode = -EFAULT;
+			goto sg_list_cleanup;
+		}
+
+		/* Copy in the user's I2O command */
+		if (copy_from_user(rmsg, user_msg, size)) {
+			rcode = -EFAULT;
+			goto sg_list_cleanup;
+		}
+		sg_count =
+		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
+
+		// TODO 64bit fix
+		sg = (struct sg_simple_element *)(rmsg + sg_offset);
+		for (j = 0; j < sg_count; j++) {
+			/* Copy out the SG list to user's buffer if necessary */
+			if (!
+			    (sg[j].
+			     flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) {
+				sg_size = sg[j].flag_count & 0xffffff;
+				// TODO 64bit fix
+				if (copy_to_user
+				    ((void __user *)sg[j].addr_bus, sg_list[j].virt,
+				     sg_size)) {
+					printk(KERN_WARNING
+					       "%s: Could not copy %p TO user %x\n",
+					       c->name, sg_list[j].virt,
+					       sg[j].addr_bus);
+					rcode = -EFAULT;
+					goto sg_list_cleanup;
+				}
+			}
+		}
+	}
+
+sg_list_cleanup:
+	/* Copy back the reply to user space */
+	if (reply_size) {
+		// we wrote our own values for context - now restore the user supplied ones
+		if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) {
+			printk(KERN_WARNING
+			       "%s: Could not copy message context FROM user\n",
+			       c->name);
+			rcode = -EFAULT;
+		}
+		if (copy_to_user(user_reply, reply, reply_size)) {
+			printk(KERN_WARNING
+			       "%s: Could not copy reply TO user\n", c->name);
+			rcode = -EFAULT;
+		}
+	}
+
+	for (i = 0; i < sg_index; i++)
+		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
+
+cleanup:
+	kfree(reply);
+out:
+	if (msg)
+		i2o_msg_nop(c, msg);
+	return rcode;
+}
+#endif
+
+/*
+ * IOCTL Handler
+ */
+static long i2o_cfg_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	mutex_lock(&i2o_cfg_mutex);
+	switch (cmd) {
+	case I2OGETIOPS:
+		ret = i2o_cfg_getiops(arg);
+		break;
+
+	case I2OHRTGET:
+		ret = i2o_cfg_gethrt(arg);
+		break;
+
+	case I2OLCTGET:
+		ret = i2o_cfg_getlct(arg);
+		break;
+
+	case I2OPARMSET:
+		ret = i2o_cfg_parms(arg, I2OPARMSET);
+		break;
+
+	case I2OPARMGET:
+		ret = i2o_cfg_parms(arg, I2OPARMGET);
+		break;
+
+	case I2OSWDL:
+		ret = i2o_cfg_swdl(arg);
+		break;
+
+	case I2OSWUL:
+		ret = i2o_cfg_swul(arg);
+		break;
+
+	case I2OSWDEL:
+		ret = i2o_cfg_swdel(arg);
+		break;
+
+	case I2OVALIDATE:
+		ret = i2o_cfg_validate(arg);
+		break;
+
+	case I2OEVTREG:
+		ret = i2o_cfg_evt_reg(arg, fp);
+		break;
+
+	case I2OEVTGET:
+		ret = i2o_cfg_evt_get(arg, fp);
+		break;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	case I2OPASSTHRU:
+		ret = i2o_cfg_passthru(arg);
+		break;
+#endif
+
+	default:
+		osm_debug("unknown ioctl called!\n");
+		ret = -EINVAL;
+	}
+	mutex_unlock(&i2o_cfg_mutex);
+	return ret;
+}
+
+static int cfg_open(struct inode *inode, struct file *file)
+{
+	struct i2o_cfg_info *tmp = kmalloc(sizeof(struct i2o_cfg_info),
+					   GFP_KERNEL);
+	unsigned long flags;
+
+	if (!tmp)
+		return -ENOMEM;
+
+	mutex_lock(&i2o_cfg_mutex);
+	file->private_data = (void *)(i2o_cfg_info_id++);
+	tmp->fp = file;
+	tmp->fasync = NULL;
+	tmp->q_id = (ulong) file->private_data;
+	tmp->q_len = 0;
+	tmp->q_in = 0;
+	tmp->q_out = 0;
+	tmp->q_lost = 0;
+	tmp->next = open_files;
+
+	spin_lock_irqsave(&i2o_config_lock, flags);
+	open_files = tmp;
+	spin_unlock_irqrestore(&i2o_config_lock, flags);
+	mutex_unlock(&i2o_cfg_mutex);
+
+	return 0;
+}
+
+static int cfg_fasync(int fd, struct file *fp, int on)
+{
+	ulong id = (ulong) fp->private_data;
+	struct i2o_cfg_info *p;
+	int ret = -EBADF;
+
+	mutex_lock(&i2o_cfg_mutex);
+	for (p = open_files; p; p = p->next)
+		if (p->q_id == id)
+			break;
+
+	if (p)
+		ret = fasync_helper(fd, fp, on, &p->fasync);
+	mutex_unlock(&i2o_cfg_mutex);
+	return ret;
+}
+
+static int cfg_release(struct inode *inode, struct file *file)
+{
+	ulong id = (ulong) file->private_data;
+	struct i2o_cfg_info *p, **q;
+	unsigned long flags;
+
+	mutex_lock(&i2o_cfg_mutex);
+	spin_lock_irqsave(&i2o_config_lock, flags);
+	for (q = &open_files; (p = *q) != NULL; q = &p->next) {
+		if (p->q_id == id) {
+			*q = p->next;
+			kfree(p);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&i2o_config_lock, flags);
+	mutex_unlock(&i2o_cfg_mutex);
+
+	return 0;
+}
+
+static const struct file_operations config_fops = {
+	.owner = THIS_MODULE,
+	.llseek = no_llseek,
+	.unlocked_ioctl = i2o_cfg_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = i2o_cfg_compat_ioctl,
+#endif
+	.open = cfg_open,
+	.release = cfg_release,
+	.fasync = cfg_fasync,
+};
+
+static struct miscdevice i2o_miscdev = {
+	I2O_MINOR,
+	"i2octl",
+	&config_fops
+};
+
+static int __init i2o_config_old_init(void)
+{
+	spin_lock_init(&i2o_config_lock);
+
+	if (misc_register(&i2o_miscdev) < 0) {
+		osm_err("can't register device.\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void i2o_config_old_exit(void)
+{
+	misc_deregister(&i2o_miscdev);
+}
+
+MODULE_AUTHOR("Red Hat Software");
diff --git a/drivers/staging/i2o/i2o_proc.c b/drivers/staging/i2o/i2o_proc.c
new file mode 100644
index 000000000000..ad84f3304f3c
--- /dev/null
+++ b/drivers/staging/i2o/i2o_proc.c
@@ -0,0 +1,2045 @@
+/*
+ *	procfs handler for Linux I2O subsystem
+ *
+ *	(c) Copyright 1999	Deepak Saxena
+ *
+ *	Originally written by Deepak Saxena(deepak@plexity.net)
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	This is an initial test release. The code is based on the design of the
+ *	ide procfs system (drivers/block/ide-proc.c). Some code taken from
+ *	i2o-core module by Alan Cox.
+ *
+ *	DISCLAIMER: This code is still under development/test and may cause
+ *	your system to behave unpredictably.  Use at your own discretion.
+ *
+ *
+ *	Fixes/additions:
+ *		Juha Sievänen (Juha.Sievanen@cs.Helsinki.FI),
+ *		Auvo Häkkinen (Auvo.Hakkinen@cs.Helsinki.FI)
+ *		University of Helsinki, Department of Computer Science
+ *			LAN entries
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			Changes for new I2O API
+ */
+
+#define OSM_NAME	"proc-osm"
+#define OSM_VERSION	"1.316"
+#define OSM_DESCRIPTION	"I2O ProcFS OSM"
+
+#define I2O_MAX_MODULES 4
+// FIXME!
+#define FMT_U64_HEX "0x%08x%08x"
+#define U64_VAL(pu64) *((u32*)(pu64)+1), *((u32*)(pu64))
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "i2o.h"
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+/* Structure used to define /proc entries */
+typedef struct _i2o_proc_entry_t {
+	char *name;		/* entry name */
+	umode_t mode;		/* mode */
+	const struct file_operations *fops;	/* open function */
+} i2o_proc_entry;
+
+/* global I2O /proc/i2o entry */
+static struct proc_dir_entry *i2o_proc_dir_root;
+
+/* proc OSM driver struct */
+static struct i2o_driver i2o_proc_driver = {
+	.name = OSM_NAME,
+};
+
+static int print_serial_number(struct seq_file *seq, u8 * serialno, int max_len)
+{
+	int i;
+
+	/* 19990419 -sralston
+	 *      The I2O v1.5 (and v2.0 so far) "official specification"
+	 *      got serial numbers WRONG!
+	 *      Apparently, and despite what Section 3.4.4 says and
+	 *      Figure 3-35 shows (pg 3-39 in the pdf doc),
+	 *      the convention / consensus seems to be:
+	 *        + First byte is SNFormat
+	 *        + Second byte is SNLen (but only if SNFormat==7 (?))
+	 *        + (v2.0) SCSI+BS may use IEEE Registered (64 or 128 bit) format
+	 */
+	switch (serialno[0]) {
+	case I2O_SNFORMAT_BINARY:	/* Binary */
+		seq_printf(seq, "0x");
+		for (i = 0; i < serialno[1]; i++) {
+			seq_printf(seq, "%02X", serialno[2 + i]);
+		}
+		break;
+
+	case I2O_SNFORMAT_ASCII:	/* ASCII */
+		if (serialno[1] < ' ') {	/* printable or SNLen? */
+			/* sanity */
+			max_len =
+			    (max_len < serialno[1]) ? max_len : serialno[1];
+			serialno[1 + max_len] = '\0';
+
+			/* just print it */
+			seq_printf(seq, "%s", &serialno[2]);
+		} else {
+			/* print chars for specified length */
+			for (i = 0; i < serialno[1]; i++) {
+				seq_printf(seq, "%c", serialno[2 + i]);
+			}
+		}
+		break;
+
+	case I2O_SNFORMAT_UNICODE:	/* UNICODE */
+		seq_printf(seq, "UNICODE Format.  Can't Display\n");
+		break;
+
+	case I2O_SNFORMAT_LAN48_MAC:	/* LAN-48 MAC Address */
+		seq_printf(seq, "LAN-48 MAC address @ %pM", &serialno[2]);
+		break;
+
+	case I2O_SNFORMAT_WAN:	/* WAN MAC Address */
+		/* FIXME: Figure out what a WAN access address looks like?? */
+		seq_printf(seq, "WAN Access Address");
+		break;
+
+/* plus new in v2.0 */
+	case I2O_SNFORMAT_LAN64_MAC:	/* LAN-64 MAC Address */
+		/* FIXME: Figure out what a LAN-64 address really looks like?? */
+		seq_printf(seq,
+			   "LAN-64 MAC address @ [?:%02X:%02X:?] %pM",
+			   serialno[8], serialno[9], &serialno[2]);
+		break;
+
+	case I2O_SNFORMAT_DDM:	/* I2O DDM */
+		seq_printf(seq,
+			   "DDM: Tid=%03Xh, Rsvd=%04Xh, OrgId=%04Xh",
+			   *(u16 *) & serialno[2],
+			   *(u16 *) & serialno[4], *(u16 *) & serialno[6]);
+		break;
+
+	case I2O_SNFORMAT_IEEE_REG64:	/* IEEE Registered (64-bit) */
+	case I2O_SNFORMAT_IEEE_REG128:	/* IEEE Registered (128-bit) */
+		/* FIXME: Figure if this is even close?? */
+		seq_printf(seq,
+			   "IEEE NodeName(hi,lo)=(%08Xh:%08Xh), PortName(hi,lo)=(%08Xh:%08Xh)\n",
+			   *(u32 *) & serialno[2],
+			   *(u32 *) & serialno[6],
+			   *(u32 *) & serialno[10], *(u32 *) & serialno[14]);
+		break;
+
+	case I2O_SNFORMAT_UNKNOWN:	/* Unknown 0    */
+	case I2O_SNFORMAT_UNKNOWN2:	/* Unknown 0xff */
+	default:
+		seq_printf(seq, "Unknown data format (0x%02x)", serialno[0]);
+		break;
+	}
+
+	return 0;
+}
+
+/**
+ *	i2o_get_class_name - 	do i2o class name lookup
+ *	@class: class number
+ *
+ *	Return a descriptive string for an i2o class.
+ */
+static const char *i2o_get_class_name(int class)
+{
+	int idx = 16;
+	static char *i2o_class_name[] = {
+		"Executive",
+		"Device Driver Module",
+		"Block Device",
+		"Tape Device",
+		"LAN Interface",
+		"WAN Interface",
+		"Fibre Channel Port",
+		"Fibre Channel Device",
+		"SCSI Device",
+		"ATE Port",
+		"ATE Device",
+		"Floppy Controller",
+		"Floppy Device",
+		"Secondary Bus Port",
+		"Peer Transport Agent",
+		"Peer Transport",
+		"Unknown"
+	};
+
+	switch (class & 0xfff) {
+	case I2O_CLASS_EXECUTIVE:
+		idx = 0;
+		break;
+	case I2O_CLASS_DDM:
+		idx = 1;
+		break;
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		idx = 2;
+		break;
+	case I2O_CLASS_SEQUENTIAL_STORAGE:
+		idx = 3;
+		break;
+	case I2O_CLASS_LAN:
+		idx = 4;
+		break;
+	case I2O_CLASS_WAN:
+		idx = 5;
+		break;
+	case I2O_CLASS_FIBRE_CHANNEL_PORT:
+		idx = 6;
+		break;
+	case I2O_CLASS_FIBRE_CHANNEL_PERIPHERAL:
+		idx = 7;
+		break;
+	case I2O_CLASS_SCSI_PERIPHERAL:
+		idx = 8;
+		break;
+	case I2O_CLASS_ATE_PORT:
+		idx = 9;
+		break;
+	case I2O_CLASS_ATE_PERIPHERAL:
+		idx = 10;
+		break;
+	case I2O_CLASS_FLOPPY_CONTROLLER:
+		idx = 11;
+		break;
+	case I2O_CLASS_FLOPPY_DEVICE:
+		idx = 12;
+		break;
+	case I2O_CLASS_BUS_ADAPTER:
+		idx = 13;
+		break;
+	case I2O_CLASS_PEER_TRANSPORT_AGENT:
+		idx = 14;
+		break;
+	case I2O_CLASS_PEER_TRANSPORT:
+		idx = 15;
+		break;
+	}
+
+	return i2o_class_name[idx];
+}
+
+#define SCSI_TABLE_SIZE	13
+static char *scsi_devices[] = {
+	"Direct-Access Read/Write",
+	"Sequential-Access Storage",
+	"Printer",
+	"Processor",
+	"WORM Device",
+	"CD-ROM Device",
+	"Scanner Device",
+	"Optical Memory Device",
+	"Medium Changer Device",
+	"Communications Device",
+	"Graphics Art Pre-Press Device",
+	"Graphics Art Pre-Press Device",
+	"Array Controller Device"
+};
+
+static char *chtostr(char *tmp, u8 *chars, int n)
+{
+	tmp[0] = 0;
+	return strncat(tmp, (char *)chars, n);
+}
+
+static int i2o_report_query_status(struct seq_file *seq, int block_status,
+				   char *group)
+{
+	switch (block_status) {
+	case -ETIMEDOUT:
+		return seq_printf(seq, "Timeout reading group %s.\n", group);
+	case -ENOMEM:
+		return seq_printf(seq, "No free memory to read the table.\n");
+	case -I2O_PARAMS_STATUS_INVALID_GROUP_ID:
+		return seq_printf(seq, "Group %s not supported.\n", group);
+	default:
+		return seq_printf(seq,
+				  "Error reading group %s. BlockStatus 0x%02X\n",
+				  group, -block_status);
+	}
+}
+
+static char *bus_strings[] = {
+	"Local Bus",
+	"ISA",
+	"EISA",
+	"PCI",
+	"PCMCIA",
+	"NUBUS",
+	"CARDBUS"
+};
+
+static int i2o_seq_show_hrt(struct seq_file *seq, void *v)
+{
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	i2o_hrt *hrt = (i2o_hrt *) c->hrt.virt;
+	u32 bus;
+	int i;
+
+	if (hrt->hrt_version) {
+		seq_printf(seq,
+			   "HRT table for controller is too new a version.\n");
+		return 0;
+	}
+
+	seq_printf(seq, "HRT has %d entries of %d bytes each.\n",
+		   hrt->num_entries, hrt->entry_len << 2);
+
+	for (i = 0; i < hrt->num_entries; i++) {
+		seq_printf(seq, "Entry %d:\n", i);
+		seq_printf(seq, "   Adapter ID: %0#10x\n",
+			   hrt->hrt_entry[i].adapter_id);
+		seq_printf(seq, "   Controlling tid: %0#6x\n",
+			   hrt->hrt_entry[i].parent_tid);
+
+		if (hrt->hrt_entry[i].bus_type != 0x80) {
+			bus = hrt->hrt_entry[i].bus_type;
+			seq_printf(seq, "   %s Information\n",
+				   bus_strings[bus]);
+
+			switch (bus) {
+			case I2O_BUS_LOCAL:
+				seq_printf(seq, "     IOBase: %0#6x,",
+					   hrt->hrt_entry[i].bus.local_bus.
+					   LbBaseIOPort);
+				seq_printf(seq, " MemoryBase: %0#10x\n",
+					   hrt->hrt_entry[i].bus.local_bus.
+					   LbBaseMemoryAddress);
+				break;
+
+			case I2O_BUS_ISA:
+				seq_printf(seq, "     IOBase: %0#6x,",
+					   hrt->hrt_entry[i].bus.isa_bus.
+					   IsaBaseIOPort);
+				seq_printf(seq, " MemoryBase: %0#10x,",
+					   hrt->hrt_entry[i].bus.isa_bus.
+					   IsaBaseMemoryAddress);
+				seq_printf(seq, " CSN: %0#4x,",
+					   hrt->hrt_entry[i].bus.isa_bus.CSN);
+				break;
+
+			case I2O_BUS_EISA:
+				seq_printf(seq, "     IOBase: %0#6x,",
+					   hrt->hrt_entry[i].bus.eisa_bus.
+					   EisaBaseIOPort);
+				seq_printf(seq, " MemoryBase: %0#10x,",
+					   hrt->hrt_entry[i].bus.eisa_bus.
+					   EisaBaseMemoryAddress);
+				seq_printf(seq, " Slot: %0#4x,",
+					   hrt->hrt_entry[i].bus.eisa_bus.
+					   EisaSlotNumber);
+				break;
+
+			case I2O_BUS_PCI:
+				seq_printf(seq, "     Bus: %0#4x",
+					   hrt->hrt_entry[i].bus.pci_bus.
+					   PciBusNumber);
+				seq_printf(seq, " Dev: %0#4x",
+					   hrt->hrt_entry[i].bus.pci_bus.
+					   PciDeviceNumber);
+				seq_printf(seq, " Func: %0#4x",
+					   hrt->hrt_entry[i].bus.pci_bus.
+					   PciFunctionNumber);
+				seq_printf(seq, " Vendor: %0#6x",
+					   hrt->hrt_entry[i].bus.pci_bus.
+					   PciVendorID);
+				seq_printf(seq, " Device: %0#6x\n",
+					   hrt->hrt_entry[i].bus.pci_bus.
+					   PciDeviceID);
+				break;
+
+			default:
+				seq_printf(seq, "      Unsupported Bus Type\n");
+			}
+		} else
+			seq_printf(seq, "   Unknown Bus Type\n");
+	}
+
+	return 0;
+}
+
+static int i2o_seq_show_lct(struct seq_file *seq, void *v)
+{
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	i2o_lct *lct = (i2o_lct *) c->lct;
+	int entries;
+	int i;
+
+#define BUS_TABLE_SIZE 3
+	static char *bus_ports[] = {
+		"Generic Bus",
+		"SCSI Bus",
+		"Fibre Channel Bus"
+	};
+
+	entries = (lct->table_size - 3) / 9;
+
+	seq_printf(seq, "LCT contains %d %s\n", entries,
+		   entries == 1 ? "entry" : "entries");
+	if (lct->boot_tid)
+		seq_printf(seq, "Boot Device @ ID %d\n", lct->boot_tid);
+
+	seq_printf(seq, "Current Change Indicator: %#10x\n", lct->change_ind);
+
+	for (i = 0; i < entries; i++) {
+		seq_printf(seq, "Entry %d\n", i);
+		seq_printf(seq, "  Class, SubClass  : %s",
+			   i2o_get_class_name(lct->lct_entry[i].class_id));
+
+		/*
+		 *      Classes which we'll print subclass info for
+		 */
+		switch (lct->lct_entry[i].class_id & 0xFFF) {
+		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+			switch (lct->lct_entry[i].sub_class) {
+			case 0x00:
+				seq_printf(seq, ", Direct-Access Read/Write");
+				break;
+
+			case 0x04:
+				seq_printf(seq, ", WORM Drive");
+				break;
+
+			case 0x05:
+				seq_printf(seq, ", CD-ROM Drive");
+				break;
+
+			case 0x07:
+				seq_printf(seq, ", Optical Memory Device");
+				break;
+
+			default:
+				seq_printf(seq, ", Unknown (0x%02x)",
+					   lct->lct_entry[i].sub_class);
+				break;
+			}
+			break;
+
+		case I2O_CLASS_LAN:
+			switch (lct->lct_entry[i].sub_class & 0xFF) {
+			case 0x30:
+				seq_printf(seq, ", Ethernet");
+				break;
+
+			case 0x40:
+				seq_printf(seq, ", 100base VG");
+				break;
+
+			case 0x50:
+				seq_printf(seq, ", IEEE 802.5/Token-Ring");
+				break;
+
+			case 0x60:
+				seq_printf(seq, ", ANSI X3T9.5 FDDI");
+				break;
+
+			case 0x70:
+				seq_printf(seq, ", Fibre Channel");
+				break;
+
+			default:
+				seq_printf(seq, ", Unknown Sub-Class (0x%02x)",
+					   lct->lct_entry[i].sub_class & 0xFF);
+				break;
+			}
+			break;
+
+		case I2O_CLASS_SCSI_PERIPHERAL:
+			if (lct->lct_entry[i].sub_class < SCSI_TABLE_SIZE)
+				seq_printf(seq, ", %s",
+					   scsi_devices[lct->lct_entry[i].
+							sub_class]);
+			else
+				seq_printf(seq, ", Unknown Device Type");
+			break;
+
+		case I2O_CLASS_BUS_ADAPTER:
+			if (lct->lct_entry[i].sub_class < BUS_TABLE_SIZE)
+				seq_printf(seq, ", %s",
+					   bus_ports[lct->lct_entry[i].
+						     sub_class]);
+			else
+				seq_printf(seq, ", Unknown Bus Type");
+			break;
+		}
+		seq_printf(seq, "\n");
+
+		seq_printf(seq, "  Local TID        : 0x%03x\n",
+			   lct->lct_entry[i].tid);
+		seq_printf(seq, "  User TID         : 0x%03x\n",
+			   lct->lct_entry[i].user_tid);
+		seq_printf(seq, "  Parent TID       : 0x%03x\n",
+			   lct->lct_entry[i].parent_tid);
+		seq_printf(seq, "  Identity Tag     : 0x%x%x%x%x%x%x%x%x\n",
+			   lct->lct_entry[i].identity_tag[0],
+			   lct->lct_entry[i].identity_tag[1],
+			   lct->lct_entry[i].identity_tag[2],
+			   lct->lct_entry[i].identity_tag[3],
+			   lct->lct_entry[i].identity_tag[4],
+			   lct->lct_entry[i].identity_tag[5],
+			   lct->lct_entry[i].identity_tag[6],
+			   lct->lct_entry[i].identity_tag[7]);
+		seq_printf(seq, "  Change Indicator : %0#10x\n",
+			   lct->lct_entry[i].change_ind);
+		seq_printf(seq, "  Event Capab Mask : %0#10x\n",
+			   lct->lct_entry[i].device_flags);
+	}
+
+	return 0;
+}
+
+static int i2o_seq_show_status(struct seq_file *seq, void *v)
+{
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	char prodstr[25];
+	int version;
+	i2o_status_block *sb = c->status_block.virt;
+
+	i2o_status_get(c);	// reread the status block
+
+	seq_printf(seq, "Organization ID        : %0#6x\n", sb->org_id);
+
+	version = sb->i2o_version;
+
+/* FIXME for Spec 2.0
+	if (version == 0x02) {
+		seq_printf(seq, "Lowest I2O version supported: ");
+		switch(workspace[2]) {
+			case 0x00:
+				seq_printf(seq, "1.0\n");
+				break;
+			case 0x01:
+				seq_printf(seq, "1.5\n");
+				break;
+			case 0x02:
+				seq_printf(seq, "2.0\n");
+				break;
+		}
+
+		seq_printf(seq, "Highest I2O version supported: ");
+		switch(workspace[3]) {
+			case 0x00:
+				seq_printf(seq, "1.0\n");
+				break;
+			case 0x01:
+				seq_printf(seq, "1.5\n");
+				break;
+			case 0x02:
+				seq_printf(seq, "2.0\n");
+				break;
+		}
+	}
+*/
+	seq_printf(seq, "IOP ID                 : %0#5x\n", sb->iop_id);
+	seq_printf(seq, "Host Unit ID           : %0#6x\n", sb->host_unit_id);
+	seq_printf(seq, "Segment Number         : %0#5x\n", sb->segment_number);
+
+	seq_printf(seq, "I2O version            : ");
+	switch (version) {
+	case 0x00:
+		seq_printf(seq, "1.0\n");
+		break;
+	case 0x01:
+		seq_printf(seq, "1.5\n");
+		break;
+	case 0x02:
+		seq_printf(seq, "2.0\n");
+		break;
+	default:
+		seq_printf(seq, "Unknown version\n");
+	}
+
+	seq_printf(seq, "IOP State              : ");
+	switch (sb->iop_state) {
+	case 0x01:
+		seq_printf(seq, "INIT\n");
+		break;
+
+	case 0x02:
+		seq_printf(seq, "RESET\n");
+		break;
+
+	case 0x04:
+		seq_printf(seq, "HOLD\n");
+		break;
+
+	case 0x05:
+		seq_printf(seq, "READY\n");
+		break;
+
+	case 0x08:
+		seq_printf(seq, "OPERATIONAL\n");
+		break;
+
+	case 0x10:
+		seq_printf(seq, "FAILED\n");
+		break;
+
+	case 0x11:
+		seq_printf(seq, "FAULTED\n");
+		break;
+
+	default:
+		seq_printf(seq, "Unknown\n");
+		break;
+	}
+
+	seq_printf(seq, "Messenger Type         : ");
+	switch (sb->msg_type) {
+	case 0x00:
+		seq_printf(seq, "Memory mapped\n");
+		break;
+	case 0x01:
+		seq_printf(seq, "Memory mapped only\n");
+		break;
+	case 0x02:
+		seq_printf(seq, "Remote only\n");
+		break;
+	case 0x03:
+		seq_printf(seq, "Memory mapped and remote\n");
+		break;
+	default:
+		seq_printf(seq, "Unknown\n");
+	}
+
+	seq_printf(seq, "Inbound Frame Size     : %d bytes\n",
+		   sb->inbound_frame_size << 2);
+	seq_printf(seq, "Max Inbound Frames     : %d\n",
+		   sb->max_inbound_frames);
+	seq_printf(seq, "Current Inbound Frames : %d\n",
+		   sb->cur_inbound_frames);
+	seq_printf(seq, "Max Outbound Frames    : %d\n",
+		   sb->max_outbound_frames);
+
+	/* Spec doesn't say if NULL terminated or not... */
+	memcpy(prodstr, sb->product_id, 24);
+	prodstr[24] = '\0';
+	seq_printf(seq, "Product ID             : %s\n", prodstr);
+	seq_printf(seq, "Expected LCT Size      : %d bytes\n",
+		   sb->expected_lct_size);
+
+	seq_printf(seq, "IOP Capabilities\n");
+	seq_printf(seq, "    Context Field Size Support : ");
+	switch (sb->iop_capabilities & 0x0000003) {
+	case 0:
+		seq_printf(seq, "Supports only 32-bit context fields\n");
+		break;
+	case 1:
+		seq_printf(seq, "Supports only 64-bit context fields\n");
+		break;
+	case 2:
+		seq_printf(seq, "Supports 32-bit and 64-bit context fields, "
+			   "but not concurrently\n");
+		break;
+	case 3:
+		seq_printf(seq, "Supports 32-bit and 64-bit context fields "
+			   "concurrently\n");
+		break;
+	default:
+		seq_printf(seq, "0x%08x\n", sb->iop_capabilities);
+	}
+	seq_printf(seq, "    Current Context Field Size : ");
+	switch (sb->iop_capabilities & 0x0000000C) {
+	case 0:
+		seq_printf(seq, "not configured\n");
+		break;
+	case 4:
+		seq_printf(seq, "Supports only 32-bit context fields\n");
+		break;
+	case 8:
+		seq_printf(seq, "Supports only 64-bit context fields\n");
+		break;
+	case 12:
+		seq_printf(seq, "Supports both 32-bit or 64-bit context fields "
+			   "concurrently\n");
+		break;
+	default:
+		seq_printf(seq, "\n");
+	}
+	seq_printf(seq, "    Inbound Peer Support       : %s\n",
+		   (sb->
+		    iop_capabilities & 0x00000010) ? "Supported" :
+		   "Not supported");
+	seq_printf(seq, "    Outbound Peer Support      : %s\n",
+		   (sb->
+		    iop_capabilities & 0x00000020) ? "Supported" :
+		   "Not supported");
+	seq_printf(seq, "    Peer to Peer Support       : %s\n",
+		   (sb->
+		    iop_capabilities & 0x00000040) ? "Supported" :
+		   "Not supported");
+
+	seq_printf(seq, "Desired private memory size   : %d kB\n",
+		   sb->desired_mem_size >> 10);
+	seq_printf(seq, "Allocated private memory size : %d kB\n",
+		   sb->current_mem_size >> 10);
+	seq_printf(seq, "Private memory base address   : %0#10x\n",
+		   sb->current_mem_base);
+	seq_printf(seq, "Desired private I/O size      : %d kB\n",
+		   sb->desired_io_size >> 10);
+	seq_printf(seq, "Allocated private I/O size    : %d kB\n",
+		   sb->current_io_size >> 10);
+	seq_printf(seq, "Private I/O base address      : %0#10x\n",
+		   sb->current_io_base);
+
+	return 0;
+}
+
+static int i2o_seq_show_hw(struct seq_file *seq, void *v)
+{
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	static u32 work32[5];
+	static u8 *work8 = (u8 *) work32;
+	static u16 *work16 = (u16 *) work32;
+	int token;
+	u32 hwcap;
+
+	static char *cpu_table[] = {
+		"Intel 80960 series",
+		"AMD2900 series",
+		"Motorola 68000 series",
+		"ARM series",
+		"MIPS series",
+		"Sparc series",
+		"PowerPC series",
+		"Intel x86 series"
+	};
+
+	token =
+	    i2o_parm_field_get(c->exec, 0x0000, -1, &work32, sizeof(work32));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0x0000 IOP Hardware");
+		return 0;
+	}
+
+	seq_printf(seq, "I2O Vendor ID    : %0#6x\n", work16[0]);
+	seq_printf(seq, "Product ID       : %0#6x\n", work16[1]);
+	seq_printf(seq, "CPU              : ");
+	if (work8[16] > 8)
+		seq_printf(seq, "Unknown\n");
+	else
+		seq_printf(seq, "%s\n", cpu_table[work8[16]]);
+	/* Anyone using ProcessorVersion? */
+
+	seq_printf(seq, "RAM              : %dkB\n", work32[1] >> 10);
+	seq_printf(seq, "Non-Volatile Mem : %dkB\n", work32[2] >> 10);
+
+	hwcap = work32[3];
+	seq_printf(seq, "Capabilities : 0x%08x\n", hwcap);
+	seq_printf(seq, "   [%s] Self booting\n",
+		   (hwcap & 0x00000001) ? "+" : "-");
+	seq_printf(seq, "   [%s] Upgradable IRTOS\n",
+		   (hwcap & 0x00000002) ? "+" : "-");
+	seq_printf(seq, "   [%s] Supports downloading DDMs\n",
+		   (hwcap & 0x00000004) ? "+" : "-");
+	seq_printf(seq, "   [%s] Supports installing DDMs\n",
+		   (hwcap & 0x00000008) ? "+" : "-");
+	seq_printf(seq, "   [%s] Battery-backed RAM\n",
+		   (hwcap & 0x00000010) ? "+" : "-");
+
+	return 0;
+}
+
+/* Executive group 0003h - Executing DDM List (table) */
+static int i2o_seq_show_ddm_table(struct seq_file *seq, void *v)
+{
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	int token;
+	int i;
+
+	typedef struct _i2o_exec_execute_ddm_table {
+		u16 ddm_tid;
+		u8 module_type;
+		u8 reserved;
+		u16 i2o_vendor_id;
+		u16 module_id;
+		u8 module_name_version[28];
+		u32 data_size;
+		u32 code_size;
+	} i2o_exec_execute_ddm_table;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		i2o_exec_execute_ddm_table ddm_table[I2O_MAX_MODULES];
+	} *result;
+
+	i2o_exec_execute_ddm_table ddm_table;
+	char tmp[28 + 1];
+
+	result = kmalloc(sizeof(*result), GFP_KERNEL);
+	if (!result)
+		return -ENOMEM;
+
+	token = i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0003, -1,
+				   NULL, 0, result, sizeof(*result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0x0003 Executing DDM List");
+		goto out;
+	}
+
+	seq_printf(seq,
+		   "Tid   Module_type     Vendor Mod_id  Module_name             Vrs  Data_size Code_size\n");
+	ddm_table = result->ddm_table[0];
+
+	for (i = 0; i < result->row_count; ddm_table = result->ddm_table[++i]) {
+		seq_printf(seq, "0x%03x ", ddm_table.ddm_tid & 0xFFF);
+
+		switch (ddm_table.module_type) {
+		case 0x01:
+			seq_printf(seq, "Downloaded DDM  ");
+			break;
+		case 0x22:
+			seq_printf(seq, "Embedded DDM    ");
+			break;
+		default:
+			seq_printf(seq, "                ");
+		}
+
+		seq_printf(seq, "%-#7x", ddm_table.i2o_vendor_id);
+		seq_printf(seq, "%-#8x", ddm_table.module_id);
+		seq_printf(seq, "%-29s",
+			   chtostr(tmp, ddm_table.module_name_version, 28));
+		seq_printf(seq, "%9d  ", ddm_table.data_size);
+		seq_printf(seq, "%8d", ddm_table.code_size);
+
+		seq_printf(seq, "\n");
+	}
+      out:
+	kfree(result);
+	return 0;
+}
+
+/* Executive group 0004h - Driver Store (scalar) */
+static int i2o_seq_show_driver_store(struct seq_file *seq, void *v)
+{
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	u32 work32[8];
+	int token;
+
+	token =
+	    i2o_parm_field_get(c->exec, 0x0004, -1, &work32, sizeof(work32));
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0x0004 Driver Store");
+		return 0;
+	}
+
+	seq_printf(seq, "Module limit  : %d\n"
+		   "Module count  : %d\n"
+		   "Current space : %d kB\n"
+		   "Free space    : %d kB\n",
+		   work32[0], work32[1], work32[2] >> 10, work32[3] >> 10);
+
+	return 0;
+}
+
+/* Executive group 0005h - Driver Store Table (table) */
+static int i2o_seq_show_drivers_stored(struct seq_file *seq, void *v)
+{
+	typedef struct _i2o_driver_store {
+		u16 stored_ddm_index;
+		u8 module_type;
+		u8 reserved;
+		u16 i2o_vendor_id;
+		u16 module_id;
+		u8 module_name_version[28];
+		u8 date[8];
+		u32 module_size;
+		u32 mpb_size;
+		u32 module_flags;
+	} i2o_driver_store_table;
+
+	struct i2o_controller *c = (struct i2o_controller *)seq->private;
+	int token;
+	int i;
+
+	typedef struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		i2o_driver_store_table dst[I2O_MAX_MODULES];
+	} i2o_driver_result_table;
+
+	i2o_driver_result_table *result;
+	i2o_driver_store_table *dst;
+	char tmp[28 + 1];
+
+	result = kmalloc(sizeof(i2o_driver_result_table), GFP_KERNEL);
+	if (result == NULL)
+		return -ENOMEM;
+
+	token = i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0005, -1,
+				   NULL, 0, result, sizeof(*result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0x0005 DRIVER STORE TABLE");
+		kfree(result);
+		return 0;
+	}
+
+	seq_printf(seq,
+		   "#  Module_type     Vendor Mod_id  Module_name             Vrs"
+		   "Date     Mod_size Par_size Flags\n");
+	for (i = 0, dst = &result->dst[0]; i < result->row_count;
+	     dst = &result->dst[++i]) {
+		seq_printf(seq, "%-3d", dst->stored_ddm_index);
+		switch (dst->module_type) {
+		case 0x01:
+			seq_printf(seq, "Downloaded DDM  ");
+			break;
+		case 0x22:
+			seq_printf(seq, "Embedded DDM    ");
+			break;
+		default:
+			seq_printf(seq, "                ");
+		}
+
+		seq_printf(seq, "%-#7x", dst->i2o_vendor_id);
+		seq_printf(seq, "%-#8x", dst->module_id);
+		seq_printf(seq, "%-29s",
+			   chtostr(tmp, dst->module_name_version, 28));
+		seq_printf(seq, "%-9s", chtostr(tmp, dst->date, 8));
+		seq_printf(seq, "%8d ", dst->module_size);
+		seq_printf(seq, "%8d ", dst->mpb_size);
+		seq_printf(seq, "0x%04x", dst->module_flags);
+		seq_printf(seq, "\n");
+	}
+
+	kfree(result);
+	return 0;
+}
+
+/* Generic group F000h - Params Descriptor (table) */
+static int i2o_seq_show_groups(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+	int i;
+	u8 properties;
+
+	typedef struct _i2o_group_info {
+		u16 group_number;
+		u16 field_count;
+		u16 row_count;
+		u8 properties;
+		u8 reserved;
+	} i2o_group_info;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		i2o_group_info group[256];
+	} *result;
+
+	result = kmalloc(sizeof(*result), GFP_KERNEL);
+	if (!result)
+		return -ENOMEM;
+
+	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0,
+				   result, sizeof(*result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0xF000 Params Descriptor");
+		goto out;
+	}
+
+	seq_printf(seq,
+		   "#  Group   FieldCount RowCount Type   Add Del Clear\n");
+
+	for (i = 0; i < result->row_count; i++) {
+		seq_printf(seq, "%-3d", i);
+		seq_printf(seq, "0x%04X ", result->group[i].group_number);
+		seq_printf(seq, "%10d ", result->group[i].field_count);
+		seq_printf(seq, "%8d ", result->group[i].row_count);
+
+		properties = result->group[i].properties;
+		if (properties & 0x1)
+			seq_printf(seq, "Table  ");
+		else
+			seq_printf(seq, "Scalar ");
+		if (properties & 0x2)
+			seq_printf(seq, " + ");
+		else
+			seq_printf(seq, " - ");
+		if (properties & 0x4)
+			seq_printf(seq, "  + ");
+		else
+			seq_printf(seq, "  - ");
+		if (properties & 0x8)
+			seq_printf(seq, "  + ");
+		else
+			seq_printf(seq, "  - ");
+
+		seq_printf(seq, "\n");
+	}
+
+	if (result->more_flag)
+		seq_printf(seq, "There is more...\n");
+      out:
+	kfree(result);
+	return 0;
+}
+
+/* Generic group F001h - Physical Device Table (table) */
+static int i2o_seq_show_phys_device(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+	int i;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		u32 adapter_id[64];
+	} result;
+
+	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF001, -1, NULL, 0,
+				   &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0xF001 Physical Device Table");
+		return 0;
+	}
+
+	if (result.row_count)
+		seq_printf(seq, "#  AdapterId\n");
+
+	for (i = 0; i < result.row_count; i++) {
+		seq_printf(seq, "%-2d", i);
+		seq_printf(seq, "%#7x\n", result.adapter_id[i]);
+	}
+
+	if (result.more_flag)
+		seq_printf(seq, "There is more...\n");
+
+	return 0;
+}
+
+/* Generic group F002h - Claimed Table (table) */
+static int i2o_seq_show_claimed(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+	int i;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		u16 claimed_tid[64];
+	} result;
+
+	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF002, -1, NULL, 0,
+				   &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0xF002 Claimed Table");
+		return 0;
+	}
+
+	if (result.row_count)
+		seq_printf(seq, "#  ClaimedTid\n");
+
+	for (i = 0; i < result.row_count; i++) {
+		seq_printf(seq, "%-2d", i);
+		seq_printf(seq, "%#7x\n", result.claimed_tid[i]);
+	}
+
+	if (result.more_flag)
+		seq_printf(seq, "There is more...\n");
+
+	return 0;
+}
+
+/* Generic group F003h - User Table (table) */
+static int i2o_seq_show_users(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+	int i;
+
+	typedef struct _i2o_user_table {
+		u16 instance;
+		u16 user_tid;
+		u8 claim_type;
+		u8 reserved1;
+		u16 reserved2;
+	} i2o_user_table;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		i2o_user_table user[64];
+	} *result;
+
+	result = kmalloc(sizeof(*result), GFP_KERNEL);
+	if (!result)
+		return -ENOMEM;
+
+	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF003, -1, NULL, 0,
+				   result, sizeof(*result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0xF003 User Table");
+		goto out;
+	}
+
+	seq_printf(seq, "#  Instance UserTid ClaimType\n");
+
+	for (i = 0; i < result->row_count; i++) {
+		seq_printf(seq, "%-3d", i);
+		seq_printf(seq, "%#8x ", result->user[i].instance);
+		seq_printf(seq, "%#7x ", result->user[i].user_tid);
+		seq_printf(seq, "%#9x\n", result->user[i].claim_type);
+	}
+
+	if (result->more_flag)
+		seq_printf(seq, "There is more...\n");
+      out:
+	kfree(result);
+	return 0;
+}
+
+/* Generic group F005h - Private message extensions (table) (optional) */
+static int i2o_seq_show_priv_msgs(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+	int i;
+
+	typedef struct _i2o_private {
+		u16 ext_instance;
+		u16 organization_id;
+		u16 x_function_code;
+	} i2o_private;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		i2o_private extension[64];
+	} result;
+
+	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0,
+				   &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0xF005 Private Message Extensions (optional)");
+		return 0;
+	}
+
+	seq_printf(seq, "Instance#  OrgId  FunctionCode\n");
+
+	for (i = 0; i < result.row_count; i++) {
+		seq_printf(seq, "%0#9x ", result.extension[i].ext_instance);
+		seq_printf(seq, "%0#6x ", result.extension[i].organization_id);
+		seq_printf(seq, "%0#6x", result.extension[i].x_function_code);
+
+		seq_printf(seq, "\n");
+	}
+
+	if (result.more_flag)
+		seq_printf(seq, "There is more...\n");
+
+	return 0;
+}
+
+/* Generic group F006h - Authorized User Table (table) */
+static int i2o_seq_show_authorized_users(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+	int i;
+
+	struct {
+		u16 result_count;
+		u16 pad;
+		u16 block_size;
+		u8 block_status;
+		u8 error_info_size;
+		u16 row_count;
+		u16 more_flag;
+		u32 alternate_tid[64];
+	} result;
+
+	token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF006, -1, NULL, 0,
+				   &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0xF006 Autohorized User Table");
+		return 0;
+	}
+
+	if (result.row_count)
+		seq_printf(seq, "#  AlternateTid\n");
+
+	for (i = 0; i < result.row_count; i++) {
+		seq_printf(seq, "%-2d", i);
+		seq_printf(seq, "%#7x ", result.alternate_tid[i]);
+	}
+
+	if (result.more_flag)
+		seq_printf(seq, "There is more...\n");
+
+	return 0;
+}
+
+/* Generic group F100h - Device Identity (scalar) */
+static int i2o_seq_show_dev_identity(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	static u32 work32[128];	// allow for "stuff" + up to 256 byte (max) serial number
+	// == (allow) 512d bytes (max)
+	static u16 *work16 = (u16 *) work32;
+	int token;
+	char tmp[16 + 1];
+
+	token = i2o_parm_field_get(d, 0xF100, -1, &work32, sizeof(work32));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0xF100 Device Identity");
+		return 0;
+	}
+
+	seq_printf(seq, "Device Class  : %s\n", i2o_get_class_name(work16[0]));
+	seq_printf(seq, "Owner TID     : %0#5x\n", work16[2]);
+	seq_printf(seq, "Parent TID    : %0#5x\n", work16[3]);
+	seq_printf(seq, "Vendor info   : %s\n",
+		   chtostr(tmp, (u8 *) (work32 + 2), 16));
+	seq_printf(seq, "Product info  : %s\n",
+		   chtostr(tmp, (u8 *) (work32 + 6), 16));
+	seq_printf(seq, "Description   : %s\n",
+		   chtostr(tmp, (u8 *) (work32 + 10), 16));
+	seq_printf(seq, "Product rev.  : %s\n",
+		   chtostr(tmp, (u8 *) (work32 + 14), 8));
+
+	seq_printf(seq, "Serial number : ");
+	print_serial_number(seq, (u8 *) (work32 + 16),
+			    /* allow for SNLen plus
+			     * possible trailing '\0'
+			     */
+			    sizeof(work32) - (16 * sizeof(u32)) - 2);
+	seq_printf(seq, "\n");
+
+	return 0;
+}
+
+static int i2o_seq_show_dev_name(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+
+	seq_printf(seq, "%s\n", dev_name(&d->device));
+
+	return 0;
+}
+
+/* Generic group F101h - DDM Identity (scalar) */
+static int i2o_seq_show_ddm_identity(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+
+	struct {
+		u16 ddm_tid;
+		u8 module_name[24];
+		u8 module_rev[8];
+		u8 sn_format;
+		u8 serial_number[12];
+		u8 pad[256];	// allow up to 256 byte (max) serial number
+	} result;
+
+	char tmp[24 + 1];
+
+	token = i2o_parm_field_get(d, 0xF101, -1, &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0xF101 DDM Identity");
+		return 0;
+	}
+
+	seq_printf(seq, "Registering DDM TID : 0x%03x\n", result.ddm_tid);
+	seq_printf(seq, "Module name         : %s\n",
+		   chtostr(tmp, result.module_name, 24));
+	seq_printf(seq, "Module revision     : %s\n",
+		   chtostr(tmp, result.module_rev, 8));
+
+	seq_printf(seq, "Serial number       : ");
+	print_serial_number(seq, result.serial_number, sizeof(result) - 36);
+	/* allow for SNLen plus possible trailing '\0' */
+
+	seq_printf(seq, "\n");
+
+	return 0;
+}
+
+/* Generic group F102h - User Information (scalar) */
+static int i2o_seq_show_uinfo(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+
+	struct {
+		u8 device_name[64];
+		u8 service_name[64];
+		u8 physical_location[64];
+		u8 instance_number[4];
+	} result;
+
+	char tmp[64 + 1];
+
+	token = i2o_parm_field_get(d, 0xF102, -1, &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token, "0xF102 User Information");
+		return 0;
+	}
+
+	seq_printf(seq, "Device name     : %s\n",
+		   chtostr(tmp, result.device_name, 64));
+	seq_printf(seq, "Service name    : %s\n",
+		   chtostr(tmp, result.service_name, 64));
+	seq_printf(seq, "Physical name   : %s\n",
+		   chtostr(tmp, result.physical_location, 64));
+	seq_printf(seq, "Instance number : %s\n",
+		   chtostr(tmp, result.instance_number, 4));
+
+	return 0;
+}
+
+/* Generic group F103h - SGL Operating Limits (scalar) */
+static int i2o_seq_show_sgl_limits(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	static u32 work32[12];
+	static u16 *work16 = (u16 *) work32;
+	static u8 *work8 = (u8 *) work32;
+	int token;
+
+	token = i2o_parm_field_get(d, 0xF103, -1, &work32, sizeof(work32));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0xF103 SGL Operating Limits");
+		return 0;
+	}
+
+	seq_printf(seq, "SGL chain size        : %d\n", work32[0]);
+	seq_printf(seq, "Max SGL chain size    : %d\n", work32[1]);
+	seq_printf(seq, "SGL chain size target : %d\n", work32[2]);
+	seq_printf(seq, "SGL frag count        : %d\n", work16[6]);
+	seq_printf(seq, "Max SGL frag count    : %d\n", work16[7]);
+	seq_printf(seq, "SGL frag count target : %d\n", work16[8]);
+
+/* FIXME
+	if (d->i2oversion == 0x02)
+	{
+*/
+	seq_printf(seq, "SGL data alignment    : %d\n", work16[8]);
+	seq_printf(seq, "SGL addr limit        : %d\n", work8[20]);
+	seq_printf(seq, "SGL addr sizes supported : ");
+	if (work8[21] & 0x01)
+		seq_printf(seq, "32 bit ");
+	if (work8[21] & 0x02)
+		seq_printf(seq, "64 bit ");
+	if (work8[21] & 0x04)
+		seq_printf(seq, "96 bit ");
+	if (work8[21] & 0x08)
+		seq_printf(seq, "128 bit ");
+	seq_printf(seq, "\n");
+/*
+	}
+*/
+
+	return 0;
+}
+
+/* Generic group F200h - Sensors (scalar) */
+static int i2o_seq_show_sensors(struct seq_file *seq, void *v)
+{
+	struct i2o_device *d = (struct i2o_device *)seq->private;
+	int token;
+
+	struct {
+		u16 sensor_instance;
+		u8 component;
+		u16 component_instance;
+		u8 sensor_class;
+		u8 sensor_type;
+		u8 scaling_exponent;
+		u32 actual_reading;
+		u32 minimum_reading;
+		u32 low2lowcat_treshold;
+		u32 lowcat2low_treshold;
+		u32 lowwarn2low_treshold;
+		u32 low2lowwarn_treshold;
+		u32 norm2lowwarn_treshold;
+		u32 lowwarn2norm_treshold;
+		u32 nominal_reading;
+		u32 hiwarn2norm_treshold;
+		u32 norm2hiwarn_treshold;
+		u32 high2hiwarn_treshold;
+		u32 hiwarn2high_treshold;
+		u32 hicat2high_treshold;
+		u32 hi2hicat_treshold;
+		u32 maximum_reading;
+		u8 sensor_state;
+		u16 event_enable;
+	} result;
+
+	token = i2o_parm_field_get(d, 0xF200, -1, &result, sizeof(result));
+
+	if (token < 0) {
+		i2o_report_query_status(seq, token,
+					"0xF200 Sensors (optional)");
+		return 0;
+	}
+
+	seq_printf(seq, "Sensor instance       : %d\n", result.sensor_instance);
+
+	seq_printf(seq, "Component             : %d = ", result.component);
+	switch (result.component) {
+	case 0:
+		seq_printf(seq, "Other");
+		break;
+	case 1:
+		seq_printf(seq, "Planar logic Board");
+		break;
+	case 2:
+		seq_printf(seq, "CPU");
+		break;
+	case 3:
+		seq_printf(seq, "Chassis");
+		break;
+	case 4:
+		seq_printf(seq, "Power Supply");
+		break;
+	case 5:
+		seq_printf(seq, "Storage");
+		break;
+	case 6:
+		seq_printf(seq, "External");
+		break;
+	}
+	seq_printf(seq, "\n");
+
+	seq_printf(seq, "Component instance    : %d\n",
+		   result.component_instance);
+	seq_printf(seq, "Sensor class          : %s\n",
+		   result.sensor_class ? "Analog" : "Digital");
+
+	seq_printf(seq, "Sensor type           : %d = ", result.sensor_type);
+	switch (result.sensor_type) {
+	case 0:
+		seq_printf(seq, "Other\n");
+		break;
+	case 1:
+		seq_printf(seq, "Thermal\n");
+		break;
+	case 2:
+		seq_printf(seq, "DC voltage (DC volts)\n");
+		break;
+	case 3:
+		seq_printf(seq, "AC voltage (AC volts)\n");
+		break;
+	case 4:
+		seq_printf(seq, "DC current (DC amps)\n");
+		break;
+	case 5:
+		seq_printf(seq, "AC current (AC volts)\n");
+		break;
+	case 6:
+		seq_printf(seq, "Door open\n");
+		break;
+	case 7:
+		seq_printf(seq, "Fan operational\n");
+		break;
+	}
+
+	seq_printf(seq, "Scaling exponent      : %d\n",
+		   result.scaling_exponent);
+	seq_printf(seq, "Actual reading        : %d\n", result.actual_reading);
+	seq_printf(seq, "Minimum reading       : %d\n", result.minimum_reading);
+	seq_printf(seq, "Low2LowCat treshold   : %d\n",
+		   result.low2lowcat_treshold);
+	seq_printf(seq, "LowCat2Low treshold   : %d\n",
+		   result.lowcat2low_treshold);
+	seq_printf(seq, "LowWarn2Low treshold  : %d\n",
+		   result.lowwarn2low_treshold);
+	seq_printf(seq, "Low2LowWarn treshold  : %d\n",
+		   result.low2lowwarn_treshold);
+	seq_printf(seq, "Norm2LowWarn treshold : %d\n",
+		   result.norm2lowwarn_treshold);
+	seq_printf(seq, "LowWarn2Norm treshold : %d\n",
+		   result.lowwarn2norm_treshold);
+	seq_printf(seq, "Nominal reading       : %d\n", result.nominal_reading);
+	seq_printf(seq, "HiWarn2Norm treshold  : %d\n",
+		   result.hiwarn2norm_treshold);
+	seq_printf(seq, "Norm2HiWarn treshold  : %d\n",
+		   result.norm2hiwarn_treshold);
+	seq_printf(seq, "High2HiWarn treshold  : %d\n",
+		   result.high2hiwarn_treshold);
+	seq_printf(seq, "HiWarn2High treshold  : %d\n",
+		   result.hiwarn2high_treshold);
+	seq_printf(seq, "HiCat2High treshold   : %d\n",
+		   result.hicat2high_treshold);
+	seq_printf(seq, "High2HiCat treshold   : %d\n",
+		   result.hi2hicat_treshold);
+	seq_printf(seq, "Maximum reading       : %d\n", result.maximum_reading);
+
+	seq_printf(seq, "Sensor state          : %d = ", result.sensor_state);
+	switch (result.sensor_state) {
+	case 0:
+		seq_printf(seq, "Normal\n");
+		break;
+	case 1:
+		seq_printf(seq, "Abnormal\n");
+		break;
+	case 2:
+		seq_printf(seq, "Unknown\n");
+		break;
+	case 3:
+		seq_printf(seq, "Low Catastrophic (LoCat)\n");
+		break;
+	case 4:
+		seq_printf(seq, "Low (Low)\n");
+		break;
+	case 5:
+		seq_printf(seq, "Low Warning (LoWarn)\n");
+		break;
+	case 6:
+		seq_printf(seq, "High Warning (HiWarn)\n");
+		break;
+	case 7:
+		seq_printf(seq, "High (High)\n");
+		break;
+	case 8:
+		seq_printf(seq, "High Catastrophic (HiCat)\n");
+		break;
+	}
+
+	seq_printf(seq, "Event_enable : 0x%02X\n", result.event_enable);
+	seq_printf(seq, "    [%s] Operational state change. \n",
+		   (result.event_enable & 0x01) ? "+" : "-");
+	seq_printf(seq, "    [%s] Low catastrophic. \n",
+		   (result.event_enable & 0x02) ? "+" : "-");
+	seq_printf(seq, "    [%s] Low reading. \n",
+		   (result.event_enable & 0x04) ? "+" : "-");
+	seq_printf(seq, "    [%s] Low warning. \n",
+		   (result.event_enable & 0x08) ? "+" : "-");
+	seq_printf(seq,
+		   "    [%s] Change back to normal from out of range state. \n",
+		   (result.event_enable & 0x10) ? "+" : "-");
+	seq_printf(seq, "    [%s] High warning. \n",
+		   (result.event_enable & 0x20) ? "+" : "-");
+	seq_printf(seq, "    [%s] High reading. \n",
+		   (result.event_enable & 0x40) ? "+" : "-");
+	seq_printf(seq, "    [%s] High catastrophic. \n",
+		   (result.event_enable & 0x80) ? "+" : "-");
+
+	return 0;
+}
+
+static int i2o_seq_open_hrt(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_hrt, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_lct(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_lct, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_status(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_status, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_hw(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_hw, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_ddm_table(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_ddm_table, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_driver_store(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_driver_store, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_drivers_stored(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_drivers_stored, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_groups(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_groups, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_phys_device(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_phys_device, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_claimed(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_claimed, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_users(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_users, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_priv_msgs(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_priv_msgs, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_authorized_users(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_authorized_users,
+			   PDE_DATA(inode));
+};
+
+static int i2o_seq_open_dev_identity(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_dev_identity, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_ddm_identity(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_ddm_identity, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_uinfo(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_uinfo, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_sgl_limits(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_sgl_limits, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_sensors(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_sensors, PDE_DATA(inode));
+};
+
+static int i2o_seq_open_dev_name(struct inode *inode, struct file *file)
+{
+	return single_open(file, i2o_seq_show_dev_name, PDE_DATA(inode));
+};
+
+static const struct file_operations i2o_seq_fops_lct = {
+	.open = i2o_seq_open_lct,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_hrt = {
+	.open = i2o_seq_open_hrt,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_status = {
+	.open = i2o_seq_open_status,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_hw = {
+	.open = i2o_seq_open_hw,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_ddm_table = {
+	.open = i2o_seq_open_ddm_table,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_driver_store = {
+	.open = i2o_seq_open_driver_store,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_drivers_stored = {
+	.open = i2o_seq_open_drivers_stored,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_groups = {
+	.open = i2o_seq_open_groups,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_phys_device = {
+	.open = i2o_seq_open_phys_device,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_claimed = {
+	.open = i2o_seq_open_claimed,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_users = {
+	.open = i2o_seq_open_users,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_priv_msgs = {
+	.open = i2o_seq_open_priv_msgs,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_authorized_users = {
+	.open = i2o_seq_open_authorized_users,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_dev_name = {
+	.open = i2o_seq_open_dev_name,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_dev_identity = {
+	.open = i2o_seq_open_dev_identity,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_ddm_identity = {
+	.open = i2o_seq_open_ddm_identity,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_uinfo = {
+	.open = i2o_seq_open_uinfo,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_sgl_limits = {
+	.open = i2o_seq_open_sgl_limits,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static const struct file_operations i2o_seq_fops_sensors = {
+	.open = i2o_seq_open_sensors,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/*
+ * IOP specific entries...write field just in case someone
+ * ever wants one.
+ */
+static i2o_proc_entry i2o_proc_generic_iop_entries[] = {
+	{"hrt", S_IFREG | S_IRUGO, &i2o_seq_fops_hrt},
+	{"lct", S_IFREG | S_IRUGO, &i2o_seq_fops_lct},
+	{"status", S_IFREG | S_IRUGO, &i2o_seq_fops_status},
+	{"hw", S_IFREG | S_IRUGO, &i2o_seq_fops_hw},
+	{"ddm_table", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_table},
+	{"driver_store", S_IFREG | S_IRUGO, &i2o_seq_fops_driver_store},
+	{"drivers_stored", S_IFREG | S_IRUGO, &i2o_seq_fops_drivers_stored},
+	{NULL, 0, NULL}
+};
+
+/*
+ * Device specific entries
+ */
+static i2o_proc_entry generic_dev_entries[] = {
+	{"groups", S_IFREG | S_IRUGO, &i2o_seq_fops_groups},
+	{"phys_dev", S_IFREG | S_IRUGO, &i2o_seq_fops_phys_device},
+	{"claimed", S_IFREG | S_IRUGO, &i2o_seq_fops_claimed},
+	{"users", S_IFREG | S_IRUGO, &i2o_seq_fops_users},
+	{"priv_msgs", S_IFREG | S_IRUGO, &i2o_seq_fops_priv_msgs},
+	{"authorized_users", S_IFREG | S_IRUGO, &i2o_seq_fops_authorized_users},
+	{"dev_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_identity},
+	{"ddm_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_identity},
+	{"user_info", S_IFREG | S_IRUGO, &i2o_seq_fops_uinfo},
+	{"sgl_limits", S_IFREG | S_IRUGO, &i2o_seq_fops_sgl_limits},
+	{"sensors", S_IFREG | S_IRUGO, &i2o_seq_fops_sensors},
+	{NULL, 0, NULL}
+};
+
+/*
+ *  Storage unit specific entries (SCSI Periph, BS) with device names
+ */
+static i2o_proc_entry rbs_dev_entries[] = {
+	{"dev_name", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_name},
+	{NULL, 0, NULL}
+};
+
+/**
+ *	i2o_proc_create_entries - Creates proc dir entries
+ *	@dir: proc dir entry under which the entries should be placed
+ *	@i2o_pe: pointer to the entries which should be added
+ *	@data: pointer to I2O controller or device
+ *
+ *	Create proc dir entries for a I2O controller or I2O device.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_proc_create_entries(struct proc_dir_entry *dir,
+				   i2o_proc_entry * i2o_pe, void *data)
+{
+	struct proc_dir_entry *tmp;
+
+	while (i2o_pe->name) {
+		tmp = proc_create_data(i2o_pe->name, i2o_pe->mode, dir,
+				       i2o_pe->fops, data);
+		if (!tmp)
+			return -1;
+
+		i2o_pe++;
+	}
+
+	return 0;
+}
+
+/**
+ *	i2o_proc_device_add - Add an I2O device to the proc dir
+ *	@dir: proc dir entry to which the device should be added
+ *	@dev: I2O device which should be added
+ *
+ *	Add an I2O device to the proc dir entry dir and create the entries for
+ *	the device depending on the class of the I2O device.
+ */
+static void i2o_proc_device_add(struct proc_dir_entry *dir,
+				struct i2o_device *dev)
+{
+	char buff[10];
+	struct proc_dir_entry *devdir;
+	i2o_proc_entry *i2o_pe = NULL;
+
+	sprintf(buff, "%03x", dev->lct_data.tid);
+
+	osm_debug("adding device /proc/i2o/%s/%s\n", dev->iop->name, buff);
+
+	devdir = proc_mkdir_data(buff, 0, dir, dev);
+	if (!devdir) {
+		osm_warn("Could not allocate procdir!\n");
+		return;
+	}
+
+	i2o_proc_create_entries(devdir, generic_dev_entries, dev);
+
+	/* Inform core that we want updates about this device's status */
+	switch (dev->lct_data.class_id) {
+	case I2O_CLASS_SCSI_PERIPHERAL:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_pe = rbs_dev_entries;
+		break;
+	default:
+		break;
+	}
+	if (i2o_pe)
+		i2o_proc_create_entries(devdir, i2o_pe, dev);
+}
+
+/**
+ *	i2o_proc_iop_add - Add an I2O controller to the i2o proc tree
+ *	@dir: parent proc dir entry
+ *	@c: I2O controller which should be added
+ *
+ *	Add the entries to the parent proc dir entry. Also each device is added
+ *	to the controllers proc dir entry.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_proc_iop_add(struct proc_dir_entry *dir,
+			    struct i2o_controller *c)
+{
+	struct proc_dir_entry *iopdir;
+	struct i2o_device *dev;
+
+	osm_debug("adding IOP /proc/i2o/%s\n", c->name);
+
+	iopdir = proc_mkdir_data(c->name, 0, dir, c);
+	if (!iopdir)
+		return -1;
+
+	i2o_proc_create_entries(iopdir, i2o_proc_generic_iop_entries, c);
+
+	list_for_each_entry(dev, &c->devices, list)
+	    i2o_proc_device_add(iopdir, dev);
+
+	return 0;
+}
+
+/**
+ *	i2o_proc_fs_create - Create the i2o proc fs.
+ *
+ *	Iterate over each I2O controller and create the entries for it.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_proc_fs_create(void)
+{
+	struct i2o_controller *c;
+
+	i2o_proc_dir_root = proc_mkdir("i2o", NULL);
+	if (!i2o_proc_dir_root)
+		return -1;
+
+	list_for_each_entry(c, &i2o_controllers, list)
+	    i2o_proc_iop_add(i2o_proc_dir_root, c);
+
+	return 0;
+};
+
+/**
+ *	i2o_proc_fs_destroy - Cleanup the all i2o proc entries
+ *
+ *	Iterate over each I2O controller and remove the entries for it.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __exit i2o_proc_fs_destroy(void)
+{
+	remove_proc_subtree("i2o", NULL);
+
+	return 0;
+};
+
+/**
+ *	i2o_proc_init - Init function for procfs
+ *
+ *	Registers Proc OSM and creates procfs entries.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_proc_init(void)
+{
+	int rc;
+
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	rc = i2o_driver_register(&i2o_proc_driver);
+	if (rc)
+		return rc;
+
+	rc = i2o_proc_fs_create();
+	if (rc) {
+		i2o_driver_unregister(&i2o_proc_driver);
+		return rc;
+	}
+
+	return 0;
+};
+
+/**
+ *	i2o_proc_exit - Exit function for procfs
+ *
+ *	Unregisters Proc OSM and removes procfs entries.
+ */
+static void __exit i2o_proc_exit(void)
+{
+	i2o_driver_unregister(&i2o_proc_driver);
+	i2o_proc_fs_destroy();
+};
+
+MODULE_AUTHOR("Deepak Saxena");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_proc_init);
+module_exit(i2o_proc_exit);
diff --git a/drivers/staging/i2o/i2o_scsi.c b/drivers/staging/i2o/i2o_scsi.c
new file mode 100644
index 000000000000..1b11dcb3faea
--- /dev/null
+++ b/drivers/staging/i2o/i2o_scsi.c
@@ -0,0 +1,814 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * For the avoidance of doubt the "preferred form" of this code is one which
+ * is in an open non patent encumbered format. Where cryptographic key signing
+ * forms part of the process of creating an executable the information
+ * including keys needed to generate an equivalently functional executable
+ * are deemed to be part of the source code.
+ *
+ *  Complications for I2O scsi
+ *
+ *	o	Each (bus,lun) is a logical device in I2O. We keep a map
+ *		table. We spoof failed selection for unmapped units
+ *	o	Request sense buffers can come back for free.
+ *	o	Scatter gather is a bit dynamic. We have to investigate at
+ *		setup time.
+ *	o	Some of our resources are dynamically shared. The i2o core
+ *		needs a message reservation protocol to avoid swap v net
+ *		deadlocking. We need to back off queue requests.
+ *
+ *	In general the firmware wants to help. Where its help isn't performance
+ *	useful we just ignore the aid. Its not worth the code in truth.
+ *
+ * Fixes/additions:
+ *	Steve Ralston:
+ *		Scatter gather now works
+ *	Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *		Minor fixes for 2.6.
+ *
+ * To Do:
+ *	64bit cleanups
+ *	Fix the resource management problems.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ioport.h>
+#include <linux/jiffies.h>
+#include <linux/interrupt.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/proc_fs.h>
+#include <linux/prefetch.h>
+#include <linux/pci.h>
+#include <linux/blkdev.h>
+#include "i2o.h"
+#include <linux/scatterlist.h>
+
+#include <asm/dma.h>
+#include <asm/io.h>
+#include <linux/atomic.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/sg.h>
+
+#define OSM_NAME	"scsi-osm"
+#define OSM_VERSION	"1.316"
+#define OSM_DESCRIPTION	"I2O SCSI Peripheral OSM"
+
+static struct i2o_driver i2o_scsi_driver;
+
+static unsigned int i2o_scsi_max_id = 16;
+static unsigned int i2o_scsi_max_lun = 255;
+
+struct i2o_scsi_host {
+	struct Scsi_Host *scsi_host;	/* pointer to the SCSI host */
+	struct i2o_controller *iop;	/* pointer to the I2O controller */
+	u64 lun;	/* lun's used for block devices */
+	struct i2o_device *channel[0];	/* channel->i2o_dev mapping table */
+};
+
+static struct scsi_host_template i2o_scsi_host_template;
+
+#define I2O_SCSI_CAN_QUEUE	4
+
+/* SCSI OSM class handling definition */
+static struct i2o_class_id i2o_scsi_class_id[] = {
+	{I2O_CLASS_SCSI_PERIPHERAL},
+	{I2O_CLASS_END}
+};
+
+static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
+{
+	struct i2o_scsi_host *i2o_shost;
+	struct i2o_device *i2o_dev;
+	struct Scsi_Host *scsi_host;
+	int max_channel = 0;
+	u8 type;
+	int i;
+	size_t size;
+	u16 body_size = 6;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec)
+		body_size = 8;
+#endif
+
+	list_for_each_entry(i2o_dev, &c->devices, list)
+	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
+		if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
+		    && (type == 0x01))	/* SCSI bus */
+			max_channel++;
+	}
+
+	if (!max_channel) {
+		osm_warn("no channels found on %s\n", c->name);
+		return ERR_PTR(-EFAULT);
+	}
+
+	size = max_channel * sizeof(struct i2o_device *)
+	    + sizeof(struct i2o_scsi_host);
+
+	scsi_host = scsi_host_alloc(&i2o_scsi_host_template, size);
+	if (!scsi_host) {
+		osm_warn("Could not allocate SCSI host\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	scsi_host->max_channel = max_channel - 1;
+	scsi_host->max_id = i2o_scsi_max_id;
+	scsi_host->max_lun = i2o_scsi_max_lun;
+	scsi_host->this_id = c->unit;
+	scsi_host->sg_tablesize = i2o_sg_tablesize(c, body_size);
+
+	i2o_shost = (struct i2o_scsi_host *)scsi_host->hostdata;
+	i2o_shost->scsi_host = scsi_host;
+	i2o_shost->iop = c;
+	i2o_shost->lun = 1;
+
+	i = 0;
+	list_for_each_entry(i2o_dev, &c->devices, list)
+	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
+		if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
+		    && (type == 0x01))	/* only SCSI bus */
+			i2o_shost->channel[i++] = i2o_dev;
+
+		if (i >= max_channel)
+			break;
+	}
+
+	return i2o_shost;
+};
+
+/**
+ *	i2o_scsi_get_host - Get an I2O SCSI host
+ *	@c: I2O controller to for which to get the SCSI host
+ *
+ *	If the I2O controller already exists as SCSI host, the SCSI host
+ *	is returned, otherwise the I2O controller is added to the SCSI
+ *	core.
+ *
+ *	Returns pointer to the I2O SCSI host on success or NULL on failure.
+ */
+static struct i2o_scsi_host *i2o_scsi_get_host(struct i2o_controller *c)
+{
+	return c->driver_data[i2o_scsi_driver.context];
+};
+
+/**
+ *	i2o_scsi_remove - Remove I2O device from SCSI core
+ *	@dev: device which should be removed
+ *
+ *	Removes the I2O device from the SCSI core again.
+ *
+ *	Returns 0 on success.
+ */
+static int i2o_scsi_remove(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	struct i2o_controller *c = i2o_dev->iop;
+	struct i2o_scsi_host *i2o_shost;
+	struct scsi_device *scsi_dev;
+
+	osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid);
+
+	i2o_shost = i2o_scsi_get_host(c);
+
+	shost_for_each_device(scsi_dev, i2o_shost->scsi_host)
+	    if (scsi_dev->hostdata == i2o_dev) {
+		sysfs_remove_link(&i2o_dev->device.kobj, "scsi");
+		scsi_remove_device(scsi_dev);
+		scsi_device_put(scsi_dev);
+		break;
+	}
+
+	return 0;
+};
+
+/**
+ *	i2o_scsi_probe - verify if dev is a I2O SCSI device and install it
+ *	@dev: device to verify if it is a I2O SCSI device
+ *
+ *	Retrieve channel, id and lun for I2O device. If everything goes well
+ *	register the I2O device as SCSI device on the I2O SCSI controller.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_scsi_probe(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	struct i2o_controller *c = i2o_dev->iop;
+	struct i2o_scsi_host *i2o_shost;
+	struct Scsi_Host *scsi_host;
+	struct i2o_device *parent;
+	struct scsi_device *scsi_dev;
+	u32 id = -1;
+	u64 lun = -1;
+	int channel = -1;
+	int i, rc;
+
+	i2o_shost = i2o_scsi_get_host(c);
+	if (!i2o_shost)
+		return -EFAULT;
+
+	scsi_host = i2o_shost->scsi_host;
+
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+	case I2O_CLASS_EXECUTIVE:
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+		if (c->adaptec) {
+			u8 type;
+			struct i2o_device *d = i2o_shost->channel[0];
+
+			if (!i2o_parm_field_get(d, 0x0000, 0, &type, 1)
+			    && (type == 0x01))	/* SCSI bus */
+				if (!i2o_parm_field_get(d, 0x0200, 4, &id, 4)) {
+					channel = 0;
+					if (i2o_dev->lct_data.class_id ==
+					    I2O_CLASS_RANDOM_BLOCK_STORAGE)
+						lun =
+						    cpu_to_le64(i2o_shost->
+								lun++);
+					else
+						lun = 0;
+				}
+		}
+#endif
+		break;
+
+	case I2O_CLASS_SCSI_PERIPHERAL:
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4))
+			return -EFAULT;
+
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8))
+			return -EFAULT;
+
+		parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid);
+		if (!parent) {
+			osm_warn("can not find parent of device %03x\n",
+				 i2o_dev->lct_data.tid);
+			return -EFAULT;
+		}
+
+		for (i = 0; i <= i2o_shost->scsi_host->max_channel; i++)
+			if (i2o_shost->channel[i] == parent)
+				channel = i;
+		break;
+
+	default:
+		return -EFAULT;
+	}
+
+	if (channel == -1) {
+		osm_warn("can not find channel of device %03x\n",
+			 i2o_dev->lct_data.tid);
+		return -EFAULT;
+	}
+
+	if (le32_to_cpu(id) >= scsi_host->max_id) {
+		osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)",
+			 le32_to_cpu(id), scsi_host->max_id);
+		return -EFAULT;
+	}
+
+	if (le64_to_cpu(lun) >= scsi_host->max_lun) {
+		osm_warn("SCSI device lun (%llu) >= max_lun of I2O host (%llu)",
+			 le64_to_cpu(lun), scsi_host->max_lun);
+		return -EFAULT;
+	}
+
+	scsi_dev =
+	    __scsi_add_device(i2o_shost->scsi_host, channel, le32_to_cpu(id),
+			      le64_to_cpu(lun), i2o_dev);
+
+	if (IS_ERR(scsi_dev)) {
+		osm_warn("can not add SCSI device %03x\n",
+			 i2o_dev->lct_data.tid);
+		return PTR_ERR(scsi_dev);
+	}
+
+	rc = sysfs_create_link(&i2o_dev->device.kobj,
+			       &scsi_dev->sdev_gendev.kobj, "scsi");
+	if (rc)
+		goto err;
+
+	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %llu\n",
+		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
+		 le64_to_cpu(lun));
+
+	return 0;
+
+err:
+	scsi_remove_device(scsi_dev);
+	return rc;
+};
+
+static const char *i2o_scsi_info(struct Scsi_Host *SChost)
+{
+	struct i2o_scsi_host *hostdata;
+	hostdata = (struct i2o_scsi_host *)SChost->hostdata;
+	return hostdata->iop->name;
+}
+
+/**
+ *	i2o_scsi_reply - SCSI OSM message reply handler
+ *	@c: controller issuing the reply
+ *	@m: message id for flushing
+ *	@msg: the message from the controller
+ *
+ *	Process reply messages (interrupts in normal scsi controller think).
+ *	We can get a variety of messages to process. The normal path is
+ *	scsi command completions. We must also deal with IOP failures,
+ *	the reply to a bus reset and the reply to a LUN query.
+ *
+ *	Returns 0 on success and if the reply should not be flushed or > 0
+ *	on success and if the reply should be flushed. Returns negative error
+ *	code on failure and if the reply should be flushed.
+ */
+static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
+			  struct i2o_message *msg)
+{
+	struct scsi_cmnd *cmd;
+	u32 error;
+	struct device *dev;
+
+	cmd = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
+	if (unlikely(!cmd)) {
+		osm_err("NULL reply received!\n");
+		return -1;
+	}
+
+	/*
+	 *      Low byte is device status, next is adapter status,
+	 *      (then one byte reserved), then request status.
+	 */
+	error = le32_to_cpu(msg->body[0]);
+
+	osm_debug("Completed %0x%p\n", cmd);
+
+	cmd->result = error & 0xff;
+	/*
+	 * if DeviceStatus is not SCSI_SUCCESS copy over the sense data and let
+	 * the SCSI layer handle the error
+	 */
+	if (cmd->result)
+		memcpy(cmd->sense_buffer, &msg->body[3],
+		       min(SCSI_SENSE_BUFFERSIZE, 40));
+
+	/* only output error code if AdapterStatus is not HBA_SUCCESS */
+	if ((error >> 8) & 0xff)
+		osm_err("SCSI error %08x\n", error);
+
+	dev = &c->pdev->dev;
+
+	scsi_dma_unmap(cmd);
+
+	cmd->scsi_done(cmd);
+
+	return 1;
+};
+
+/**
+ *	i2o_scsi_notify_device_add - Retrieve notifications of added devices
+ *	@i2o_dev: the I2O device which was added
+ *
+ *	If a I2O device is added we catch the notification, because I2O classes
+ *	other than SCSI peripheral will not be received through
+ *	i2o_scsi_probe().
+ */
+static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev)
+{
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_EXECUTIVE:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_scsi_probe(&i2o_dev->device);
+		break;
+
+	default:
+		break;
+	}
+};
+
+/**
+ *	i2o_scsi_notify_device_remove - Retrieve notifications of removed devices
+ *	@i2o_dev: the I2O device which was removed
+ *
+ *	If a I2O device is removed, we catch the notification to remove the
+ *	corresponding SCSI device.
+ */
+static void i2o_scsi_notify_device_remove(struct i2o_device *i2o_dev)
+{
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_EXECUTIVE:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_scsi_remove(&i2o_dev->device);
+		break;
+
+	default:
+		break;
+	}
+};
+
+/**
+ *	i2o_scsi_notify_controller_add - Retrieve notifications of added controllers
+ *	@c: the controller which was added
+ *
+ *	If a I2O controller is added, we catch the notification to add a
+ *	corresponding Scsi_Host.
+ */
+static void i2o_scsi_notify_controller_add(struct i2o_controller *c)
+{
+	struct i2o_scsi_host *i2o_shost;
+	int rc;
+
+	i2o_shost = i2o_scsi_host_alloc(c);
+	if (IS_ERR(i2o_shost)) {
+		osm_err("Could not initialize SCSI host\n");
+		return;
+	}
+
+	rc = scsi_add_host(i2o_shost->scsi_host, &c->device);
+	if (rc) {
+		osm_err("Could not add SCSI host\n");
+		scsi_host_put(i2o_shost->scsi_host);
+		return;
+	}
+
+	c->driver_data[i2o_scsi_driver.context] = i2o_shost;
+
+	osm_debug("new I2O SCSI host added\n");
+};
+
+/**
+ *	i2o_scsi_notify_controller_remove - Retrieve notifications of removed controllers
+ *	@c: the controller which was removed
+ *
+ *	If a I2O controller is removed, we catch the notification to remove the
+ *	corresponding Scsi_Host.
+ */
+static void i2o_scsi_notify_controller_remove(struct i2o_controller *c)
+{
+	struct i2o_scsi_host *i2o_shost;
+	i2o_shost = i2o_scsi_get_host(c);
+	if (!i2o_shost)
+		return;
+
+	c->driver_data[i2o_scsi_driver.context] = NULL;
+
+	scsi_remove_host(i2o_shost->scsi_host);
+	scsi_host_put(i2o_shost->scsi_host);
+	osm_debug("I2O SCSI host removed\n");
+};
+
+/* SCSI OSM driver struct */
+static struct i2o_driver i2o_scsi_driver = {
+	.name = OSM_NAME,
+	.reply = i2o_scsi_reply,
+	.classes = i2o_scsi_class_id,
+	.notify_device_add = i2o_scsi_notify_device_add,
+	.notify_device_remove = i2o_scsi_notify_device_remove,
+	.notify_controller_add = i2o_scsi_notify_controller_add,
+	.notify_controller_remove = i2o_scsi_notify_controller_remove,
+	.driver = {
+		   .probe = i2o_scsi_probe,
+		   .remove = i2o_scsi_remove,
+		   },
+};
+
+/**
+ *	i2o_scsi_queuecommand - queue a SCSI command
+ *	@SCpnt: scsi command pointer
+ *	@done: callback for completion
+ *
+ *	Issue a scsi command asynchronously. Return 0 on success or 1 if
+ *	we hit an error (normally message queue congestion). The only
+ *	minor complication here is that I2O deals with the device addressing
+ *	so we have to map the bus/dev/lun back to an I2O handle as well
+ *	as faking absent devices ourself.
+ *
+ *	Locks: takes the controller lock on error path only
+ */
+
+static int i2o_scsi_queuecommand_lck(struct scsi_cmnd *SCpnt,
+				 void (*done) (struct scsi_cmnd *))
+{
+	struct i2o_controller *c;
+	struct i2o_device *i2o_dev;
+	int tid;
+	struct i2o_message *msg;
+	/*
+	 * ENABLE_DISCONNECT
+	 * SIMPLE_TAG
+	 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
+	 */
+	u32 scsi_flags = 0x20a00000;
+	u32 sgl_offset;
+	u32 *mptr;
+	u32 cmd = I2O_CMD_SCSI_EXEC << 24;
+	int rc = 0;
+
+	/*
+	 *      Do the incoming paperwork
+	 */
+	i2o_dev = SCpnt->device->hostdata;
+
+	SCpnt->scsi_done = done;
+
+	if (unlikely(!i2o_dev)) {
+		osm_warn("no I2O device in request\n");
+		SCpnt->result = DID_NO_CONNECT << 16;
+		done(SCpnt);
+		goto exit;
+	}
+	c = i2o_dev->iop;
+	tid = i2o_dev->lct_data.tid;
+
+	osm_debug("qcmd: Tid = %03x\n", tid);
+	osm_debug("Real scsi messages.\n");
+
+	/*
+	 *      Put together a scsi execscb message
+	 */
+	switch (SCpnt->sc_data_direction) {
+	case PCI_DMA_NONE:
+		/* DATA NO XFER */
+		sgl_offset = SGL_OFFSET_0;
+		break;
+
+	case PCI_DMA_TODEVICE:
+		/* DATA OUT (iop-->dev) */
+		scsi_flags |= 0x80000000;
+		sgl_offset = SGL_OFFSET_10;
+		break;
+
+	case PCI_DMA_FROMDEVICE:
+		/* DATA IN  (iop<--dev) */
+		scsi_flags |= 0x40000000;
+		sgl_offset = SGL_OFFSET_10;
+		break;
+
+	default:
+		/* Unknown - kill the command */
+		SCpnt->result = DID_NO_CONNECT << 16;
+		done(SCpnt);
+		goto exit;
+	}
+
+	/*
+	 *      Obtain an I2O message. If there are none free then
+	 *      throw it back to the scsi layer
+	 */
+
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
+		rc = SCSI_MLQUEUE_HOST_BUSY;
+		goto exit;
+	}
+
+	mptr = &msg->body[0];
+
+#if 0 /* this code can't work */
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		u32 adpt_flags = 0;
+
+		if (SCpnt->sc_request && SCpnt->sc_request->upper_private_data) {
+			i2o_sg_io_hdr_t __user *usr_ptr =
+			    ((Sg_request *) (SCpnt->sc_request->
+					     upper_private_data))->header.
+			    usr_ptr;
+
+			if (usr_ptr)
+				get_user(adpt_flags, &usr_ptr->flags);
+		}
+
+		switch (i2o_dev->lct_data.class_id) {
+		case I2O_CLASS_EXECUTIVE:
+		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+			/* interpret flag has to be set for executive */
+			adpt_flags ^= I2O_DPT_SG_FLAG_INTERPRET;
+			break;
+
+		default:
+			break;
+		}
+
+		/*
+		 * for Adaptec controllers we use the PRIVATE command, because
+		 * the normal SCSI EXEC doesn't support all SCSI commands on
+		 * all controllers (for example READ CAPACITY).
+		 */
+		if (sgl_offset == SGL_OFFSET_10)
+			sgl_offset = SGL_OFFSET_12;
+		cmd = I2O_CMD_PRIVATE << 24;
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(adpt_flags | tid);
+	}
+#endif
+#endif
+
+	msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+	msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
+
+	/* We want the SCSI control block back */
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
+
+	/* LSI_920_PCI_QUIRK
+	 *
+	 *      Intermittant observations of msg frame word data corruption
+	 *      observed on msg[4] after:
+	 *        WRITE, READ-MODIFY-WRITE
+	 *      operations.  19990606 -sralston
+	 *
+	 *      (Hence we build this word via tag. Its good practice anyway
+	 *       we don't want fetches over PCI needlessly)
+	 */
+
+	/* Attach tags to the devices */
+	/* FIXME: implement
+	   if(SCpnt->device->tagged_supported) {
+	   if(SCpnt->tag == HEAD_OF_QUEUE_TAG)
+	   scsi_flags |= 0x01000000;
+	   else if(SCpnt->tag == ORDERED_QUEUE_TAG)
+	   scsi_flags |= 0x01800000;
+	   }
+	 */
+
+	*mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
+
+	/* Write SCSI command into the message - always 16 byte block */
+	memcpy(mptr, SCpnt->cmnd, 16);
+	mptr += 4;
+
+	if (sgl_offset != SGL_OFFSET_0) {
+		/* write size of data addressed by SGL */
+		*mptr++ = cpu_to_le32(scsi_bufflen(SCpnt));
+
+		/* Now fill in the SGList and command */
+
+		if (scsi_sg_count(SCpnt)) {
+			if (!i2o_dma_map_sg(c, scsi_sglist(SCpnt),
+					    scsi_sg_count(SCpnt),
+					    SCpnt->sc_data_direction, &mptr))
+				goto nomem;
+		}
+	}
+
+	/* Stick the headers on */
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
+
+	/* Queue the message */
+	i2o_msg_post(c, msg);
+
+	osm_debug("Issued %0x%p\n", SCpnt);
+
+	return 0;
+
+      nomem:
+	rc = -ENOMEM;
+	i2o_msg_nop(c, msg);
+
+      exit:
+	return rc;
+}
+
+static DEF_SCSI_QCMD(i2o_scsi_queuecommand)
+
+/**
+ *	i2o_scsi_abort - abort a running command
+ *	@SCpnt: command to abort
+ *
+ *	Ask the I2O controller to abort a command. This is an asynchrnous
+ *	process and our callback handler will see the command complete with an
+ *	aborted message if it succeeds.
+ *
+ *	Returns 0 if the command is successfully aborted or negative error code
+ *	on failure.
+ */
+static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
+{
+	struct i2o_device *i2o_dev;
+	struct i2o_controller *c;
+	struct i2o_message *msg;
+	int tid;
+	int status = FAILED;
+
+	osm_warn("Aborting command block.\n");
+
+	i2o_dev = SCpnt->device->hostdata;
+	c = i2o_dev->iop;
+	tid = i2o_dev->lct_data.tid;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return SCSI_MLQUEUE_HOST_BUSY;
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid);
+	msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt));
+
+	if (!i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT))
+		status = SUCCESS;
+
+	return status;
+}
+
+/**
+ *	i2o_scsi_bios_param	-	Invent disk geometry
+ *	@sdev: scsi device
+ *	@dev: block layer device
+ *	@capacity: size in sectors
+ *	@ip: geometry array
+ *
+ *	This is anyone's guess quite frankly. We use the same rules everyone
+ *	else appears to and hope. It seems to work.
+ */
+
+static int i2o_scsi_bios_param(struct scsi_device *sdev,
+			       struct block_device *dev, sector_t capacity,
+			       int *ip)
+{
+	int size;
+
+	size = capacity;
+	ip[0] = 64;		/* heads                        */
+	ip[1] = 32;		/* sectors                      */
+	if ((ip[2] = size >> 11) > 1024) {	/* cylinders, test for big disk */
+		ip[0] = 255;	/* heads                        */
+		ip[1] = 63;	/* sectors                      */
+		ip[2] = size / (255 * 63);	/* cylinders                    */
+	}
+	return 0;
+}
+
+static struct scsi_host_template i2o_scsi_host_template = {
+	.proc_name = OSM_NAME,
+	.name = OSM_DESCRIPTION,
+	.info = i2o_scsi_info,
+	.queuecommand = i2o_scsi_queuecommand,
+	.eh_abort_handler = i2o_scsi_abort,
+	.bios_param = i2o_scsi_bios_param,
+	.can_queue = I2O_SCSI_CAN_QUEUE,
+	.sg_tablesize = 8,
+	.cmd_per_lun = 6,
+	.use_clustering = ENABLE_CLUSTERING,
+};
+
+/**
+ *	i2o_scsi_init - SCSI OSM initialization function
+ *
+ *	Register SCSI OSM into I2O core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_scsi_init(void)
+{
+	int rc;
+
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	/* Register SCSI OSM into I2O core */
+	rc = i2o_driver_register(&i2o_scsi_driver);
+	if (rc) {
+		osm_err("Could not register SCSI driver\n");
+		return rc;
+	}
+
+	return 0;
+};
+
+/**
+ *	i2o_scsi_exit - SCSI OSM exit function
+ *
+ *	Unregisters SCSI OSM from I2O core.
+ */
+static void __exit i2o_scsi_exit(void)
+{
+	/* Unregister I2O SCSI OSM from I2O core */
+	i2o_driver_unregister(&i2o_scsi_driver);
+};
+
+MODULE_AUTHOR("Red Hat Software");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_scsi_init);
+module_exit(i2o_scsi_exit);
diff --git a/drivers/staging/i2o/iop.c b/drivers/staging/i2o/iop.c
new file mode 100644
index 000000000000..52334fc8b547
--- /dev/null
+++ b/drivers/staging/i2o/iop.c
@@ -0,0 +1,1247 @@
+/*
+ *	Functions to handle I2O controllers and I2O message handling
+ *
+ *	Copyright (C) 1999-2002	Red Hat Software
+ *
+ *	Written by Alan Cox, Building Number Three Ltd
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	A lot of the I2O message side code from this is taken from the
+ *	Red Creek RCPCI45 adapter driver by Red Creek Communications
+ *
+ *	Fixes/additions:
+ *		Philipp Rumpf
+ *		Juha Sievänen <Juha.Sievanen@cs.Helsinki.FI>
+ *		Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
+ *		Deepak Saxena <deepak@plexity.net>
+ *		Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>:
+ *			Ported to Linux 2.5.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Minor fixes for 2.6.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "core.h"
+
+#define OSM_NAME	"i2o"
+#define OSM_VERSION	"1.325"
+#define OSM_DESCRIPTION	"I2O subsystem"
+
+/* global I2O controller list */
+LIST_HEAD(i2o_controllers);
+
+/*
+ * global I2O System Table. Contains information about all the IOPs in the
+ * system. Used to inform IOPs about each others existence.
+ */
+static struct i2o_dma i2o_systab;
+
+static int i2o_hrt_get(struct i2o_controller *c);
+
+/**
+ *	i2o_msg_get_wait - obtain an I2O message from the IOP
+ *	@c: I2O controller
+ *	@wait: how long to wait until timeout
+ *
+ *	This function waits up to wait seconds for a message slot to be
+ *	available.
+ *
+ *	On a success the message is returned and the pointer to the message is
+ *	set in msg. The returned message is the physical page frame offset
+ *	address from the read port (see the i2o spec). If no message is
+ *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
+ */
+struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
+{
+	unsigned long timeout = jiffies + wait * HZ;
+	struct i2o_message *msg;
+
+	while (IS_ERR(msg = i2o_msg_get(c))) {
+		if (time_after(jiffies, timeout)) {
+			osm_debug("%s: Timeout waiting for message frame.\n",
+				  c->name);
+			return ERR_PTR(-ETIMEDOUT);
+		}
+		schedule_timeout_uninterruptible(1);
+	}
+
+	return msg;
+};
+
+#if BITS_PER_LONG == 64
+/**
+ *      i2o_cntxt_list_add - Append a pointer to context list and return a id
+ *	@c: controller to which the context list belong
+ *	@ptr: pointer to add to the context list
+ *
+ *	Because the context field in I2O is only 32-bit large, on 64-bit the
+ *	pointer is to large to fit in the context field. The i2o_cntxt_list
+ *	functions therefore map pointers to context fields.
+ *
+ *	Returns context id > 0 on success or 0 on failure.
+ */
+u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
+{
+	struct i2o_context_list_element *entry;
+	unsigned long flags;
+
+	if (!ptr)
+		osm_err("%s: couldn't add NULL pointer to context list!\n",
+			c->name);
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		osm_err("%s: Could not allocate memory for context list element"
+			"\n", c->name);
+		return 0;
+	}
+
+	entry->ptr = ptr;
+	entry->timestamp = jiffies;
+	INIT_LIST_HEAD(&entry->list);
+
+	spin_lock_irqsave(&c->context_list_lock, flags);
+
+	if (unlikely(atomic_inc_and_test(&c->context_list_counter)))
+		atomic_inc(&c->context_list_counter);
+
+	entry->context = atomic_read(&c->context_list_counter);
+
+	list_add(&entry->list, &c->context_list);
+
+	spin_unlock_irqrestore(&c->context_list_lock, flags);
+
+	osm_debug("%s: Add context to list %p -> %d\n", c->name, ptr, context);
+
+	return entry->context;
+};
+
+/**
+ *      i2o_cntxt_list_remove - Remove a pointer from the context list
+ *	@c: controller to which the context list belong
+ *	@ptr: pointer which should be removed from the context list
+ *
+ *	Removes a previously added pointer from the context list and returns
+ *	the matching context id.
+ *
+ *	Returns context id on success or 0 on failure.
+ */
+u32 i2o_cntxt_list_remove(struct i2o_controller * c, void *ptr)
+{
+	struct i2o_context_list_element *entry;
+	u32 context = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c->context_list_lock, flags);
+	list_for_each_entry(entry, &c->context_list, list)
+	    if (entry->ptr == ptr) {
+		list_del(&entry->list);
+		context = entry->context;
+		kfree(entry);
+		break;
+	}
+	spin_unlock_irqrestore(&c->context_list_lock, flags);
+
+	if (!context)
+		osm_warn("%s: Could not remove nonexistent ptr %p\n", c->name,
+			 ptr);
+
+	osm_debug("%s: remove ptr from context list %d -> %p\n", c->name,
+		  context, ptr);
+
+	return context;
+};
+
+/**
+ *      i2o_cntxt_list_get - Get a pointer from the context list and remove it
+ *	@c: controller to which the context list belong
+ *	@context: context id to which the pointer belong
+ *
+ *	Returns pointer to the matching context id on success or NULL on
+ *	failure.
+ */
+void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
+{
+	struct i2o_context_list_element *entry;
+	unsigned long flags;
+	void *ptr = NULL;
+
+	spin_lock_irqsave(&c->context_list_lock, flags);
+	list_for_each_entry(entry, &c->context_list, list)
+	    if (entry->context == context) {
+		list_del(&entry->list);
+		ptr = entry->ptr;
+		kfree(entry);
+		break;
+	}
+	spin_unlock_irqrestore(&c->context_list_lock, flags);
+
+	if (!ptr)
+		osm_warn("%s: context id %d not found\n", c->name, context);
+
+	osm_debug("%s: get ptr from context list %d -> %p\n", c->name, context,
+		  ptr);
+
+	return ptr;
+};
+
+/**
+ *      i2o_cntxt_list_get_ptr - Get a context id from the context list
+ *	@c: controller to which the context list belong
+ *	@ptr: pointer to which the context id should be fetched
+ *
+ *	Returns context id which matches to the pointer on success or 0 on
+ *	failure.
+ */
+u32 i2o_cntxt_list_get_ptr(struct i2o_controller * c, void *ptr)
+{
+	struct i2o_context_list_element *entry;
+	u32 context = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c->context_list_lock, flags);
+	list_for_each_entry(entry, &c->context_list, list)
+	    if (entry->ptr == ptr) {
+		context = entry->context;
+		break;
+	}
+	spin_unlock_irqrestore(&c->context_list_lock, flags);
+
+	if (!context)
+		osm_warn("%s: Could not find nonexistent ptr %p\n", c->name,
+			 ptr);
+
+	osm_debug("%s: get context id from context list %p -> %d\n", c->name,
+		  ptr, context);
+
+	return context;
+};
+#endif
+
+/**
+ *	i2o_iop_find - Find an I2O controller by id
+ *	@unit: unit number of the I2O controller to search for
+ *
+ *	Lookup the I2O controller on the controller list.
+ *
+ *	Returns pointer to the I2O controller on success or NULL if not found.
+ */
+struct i2o_controller *i2o_find_iop(int unit)
+{
+	struct i2o_controller *c;
+
+	list_for_each_entry(c, &i2o_controllers, list) {
+		if (c->unit == unit)
+			return c;
+	}
+
+	return NULL;
+};
+
+/**
+ *	i2o_iop_find_device - Find a I2O device on an I2O controller
+ *	@c: I2O controller where the I2O device hangs on
+ *	@tid: TID of the I2O device to search for
+ *
+ *	Searches the devices of the I2O controller for a device with TID tid and
+ *	returns it.
+ *
+ *	Returns a pointer to the I2O device if found, otherwise NULL.
+ */
+struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid)
+{
+	struct i2o_device *dev;
+
+	list_for_each_entry(dev, &c->devices, list)
+	    if (dev->lct_data.tid == tid)
+		return dev;
+
+	return NULL;
+};
+
+/**
+ *	i2o_quiesce_controller - quiesce controller
+ *	@c: controller
+ *
+ *	Quiesce an IOP. Causes IOP to make external operation quiescent
+ *	(i2o 'READY' state). Internal operation of the IOP continues normally.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_quiesce(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	i2o_status_block *sb = c->status_block.virt;
+	int rc;
+
+	i2o_status_get(c);
+
+	/* SysQuiesce discarded if IOP not in READY or OPERATIONAL state */
+	if ((sb->iop_state != ADAPTER_STATE_READY) &&
+	    (sb->iop_state != ADAPTER_STATE_OPERATIONAL))
+		return 0;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+
+	/* Long timeout needed for quiesce if lots of devices */
+	if ((rc = i2o_msg_post_wait(c, msg, 240)))
+		osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc);
+	else
+		osm_debug("%s: Quiesced.\n", c->name);
+
+	i2o_status_get(c);	// Entered READY state
+
+	return rc;
+};
+
+/**
+ *	i2o_iop_enable - move controller from ready to OPERATIONAL
+ *	@c: I2O controller
+ *
+ *	Enable IOP. This allows the IOP to resume external operations and
+ *	reverses the effect of a quiesce. Returns zero or an error code if
+ *	an error occurs.
+ */
+static int i2o_iop_enable(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	i2o_status_block *sb = c->status_block.virt;
+	int rc;
+
+	i2o_status_get(c);
+
+	/* Enable only allowed on READY state */
+	if (sb->iop_state != ADAPTER_STATE_READY)
+		return -EINVAL;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+
+	/* How long of a timeout do we need? */
+	if ((rc = i2o_msg_post_wait(c, msg, 240)))
+		osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc);
+	else
+		osm_debug("%s: Enabled.\n", c->name);
+
+	i2o_status_get(c);	// entered OPERATIONAL state
+
+	return rc;
+};
+
+/**
+ *	i2o_iop_quiesce_all - Quiesce all I2O controllers on the system
+ *
+ *	Quiesce all I2O controllers which are connected to the system.
+ */
+static inline void i2o_iop_quiesce_all(void)
+{
+	struct i2o_controller *c, *tmp;
+
+	list_for_each_entry_safe(c, tmp, &i2o_controllers, list) {
+		if (!c->no_quiesce)
+			i2o_iop_quiesce(c);
+	}
+};
+
+/**
+ *	i2o_iop_enable_all - Enables all controllers on the system
+ *
+ *	Enables all I2O controllers which are connected to the system.
+ */
+static inline void i2o_iop_enable_all(void)
+{
+	struct i2o_controller *c, *tmp;
+
+	list_for_each_entry_safe(c, tmp, &i2o_controllers, list)
+	    i2o_iop_enable(c);
+};
+
+/**
+ *	i2o_clear_controller - Bring I2O controller into HOLD state
+ *	@c: controller
+ *
+ *	Clear an IOP to HOLD state, ie. terminate external operations, clear all
+ *	input queues and prepare for a system restart. IOP's internal operation
+ *	continues normally and the outbound queue is alive. The IOP is not
+ *	expected to rebuild its LCT.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_clear(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	int rc;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	/* Quiesce all IOPs first */
+	i2o_iop_quiesce_all();
+
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+
+	if ((rc = i2o_msg_post_wait(c, msg, 30)))
+		osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc);
+	else
+		osm_debug("%s: Cleared.\n", c->name);
+
+	/* Enable all IOPs */
+	i2o_iop_enable_all();
+
+	return rc;
+}
+
+/**
+ *	i2o_iop_init_outbound_queue - setup the outbound message queue
+ *	@c: I2O controller
+ *
+ *	Clear and (re)initialize IOP's outbound queue and post the message
+ *	frames to the IOP.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
+{
+	u32 m;
+	volatile u8 *status = c->status.virt;
+	struct i2o_message *msg;
+	ulong timeout;
+	int i;
+
+	osm_debug("%s: Initializing Outbound Queue...\n", c->name);
+
+	memset(c->status.virt, 0, 4);
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(PAGE_SIZE);
+	/* Outbound msg frame size in words and Initcode */
+	msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80);
+	msg->body[2] = cpu_to_le32(0xd0000004);
+	msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys));
+	msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys));
+
+	i2o_msg_post(c, msg);
+
+	timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
+	while (*status <= I2O_CMD_IN_PROGRESS) {
+		if (time_after(jiffies, timeout)) {
+			osm_warn("%s: Timeout Initializing\n", c->name);
+			return -ETIMEDOUT;
+		}
+		schedule_timeout_uninterruptible(1);
+	}
+
+	m = c->out_queue.phys;
+
+	/* Post frames */
+	for (i = 0; i < I2O_MAX_OUTBOUND_MSG_FRAMES; i++) {
+		i2o_flush_reply(c, m);
+		udelay(1);	/* Promise */
+		m += I2O_OUTBOUND_MSG_FRAME_SIZE * sizeof(u32);
+	}
+
+	return 0;
+}
+
+/**
+ *	i2o_iop_reset - reset an I2O controller
+ *	@c: controller to reset
+ *
+ *	Reset the IOP into INIT state and wait until IOP gets into RESET state.
+ *	Terminate all external operations, clear IOP's inbound and outbound
+ *	queues, terminate all DDMs, and reload the IOP's operating environment
+ *	and all local DDMs. The IOP rebuilds its LCT.
+ */
+static int i2o_iop_reset(struct i2o_controller *c)
+{
+	volatile u8 *status = c->status.virt;
+	struct i2o_message *msg;
+	unsigned long timeout;
+	i2o_status_block *sb = c->status_block.virt;
+	int rc = 0;
+
+	osm_debug("%s: Resetting controller\n", c->name);
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	memset(c->status_block.virt, 0, 8);
+
+	/* Quiesce all IOPs first */
+	i2o_iop_quiesce_all();
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0x00000000);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys));
+	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys));
+
+	i2o_msg_post(c, msg);
+
+	/* Wait for a reply */
+	timeout = jiffies + I2O_TIMEOUT_RESET * HZ;
+	while (!*status) {
+		if (time_after(jiffies, timeout))
+			break;
+
+		schedule_timeout_uninterruptible(1);
+	}
+
+	switch (*status) {
+	case I2O_CMD_REJECTED:
+		osm_warn("%s: IOP reset rejected\n", c->name);
+		rc = -EPERM;
+		break;
+
+	case I2O_CMD_IN_PROGRESS:
+		/*
+		 * Once the reset is sent, the IOP goes into the INIT state
+		 * which is indeterminate. We need to wait until the IOP has
+		 * rebooted before we can let the system talk to it. We read
+		 * the inbound Free_List until a message is available. If we
+		 * can't read one in the given amount of time, we assume the
+		 * IOP could not reboot properly.
+		 */
+		osm_debug("%s: Reset in progress, waiting for reboot...\n",
+			  c->name);
+
+		while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) {
+			if (time_after(jiffies, timeout)) {
+				osm_err("%s: IOP reset timeout.\n", c->name);
+				rc = PTR_ERR(msg);
+				goto exit;
+			}
+			schedule_timeout_uninterruptible(1);
+		}
+		i2o_msg_nop(c, msg);
+
+		/* from here all quiesce commands are safe */
+		c->no_quiesce = 0;
+
+		/* verify if controller is in state RESET */
+		i2o_status_get(c);
+
+		if (!c->promise && (sb->iop_state != ADAPTER_STATE_RESET))
+			osm_warn("%s: reset completed, but adapter not in RESET"
+				 " state.\n", c->name);
+		else
+			osm_debug("%s: reset completed.\n", c->name);
+
+		break;
+
+	default:
+		osm_err("%s: IOP reset timeout.\n", c->name);
+		rc = -ETIMEDOUT;
+		break;
+	}
+
+      exit:
+	/* Enable all IOPs */
+	i2o_iop_enable_all();
+
+	return rc;
+};
+
+/**
+ *	i2o_iop_activate - Bring controller up to HOLD
+ *	@c: controller
+ *
+ *	This function brings an I2O controller into HOLD state. The adapter
+ *	is reset if necessary and then the queues and resource table are read.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_activate(struct i2o_controller *c)
+{
+	i2o_status_block *sb = c->status_block.virt;
+	int rc;
+	int state;
+
+	/* In INIT state, Wait Inbound Q to initialize (in i2o_status_get) */
+	/* In READY state, Get status */
+
+	rc = i2o_status_get(c);
+	if (rc) {
+		osm_info("%s: Unable to obtain status, attempting a reset.\n",
+			 c->name);
+		rc = i2o_iop_reset(c);
+		if (rc)
+			return rc;
+	}
+
+	if (sb->i2o_version > I2OVER15) {
+		osm_err("%s: Not running version 1.5 of the I2O Specification."
+			"\n", c->name);
+		return -ENODEV;
+	}
+
+	switch (sb->iop_state) {
+	case ADAPTER_STATE_FAULTED:
+		osm_err("%s: hardware fault\n", c->name);
+		return -EFAULT;
+
+	case ADAPTER_STATE_READY:
+	case ADAPTER_STATE_OPERATIONAL:
+	case ADAPTER_STATE_HOLD:
+	case ADAPTER_STATE_FAILED:
+		osm_debug("%s: already running, trying to reset...\n", c->name);
+		rc = i2o_iop_reset(c);
+		if (rc)
+			return rc;
+	}
+
+	/* preserve state */
+	state = sb->iop_state;
+
+	rc = i2o_iop_init_outbound_queue(c);
+	if (rc)
+		return rc;
+
+	/* if adapter was not in RESET state clear now */
+	if (state != ADAPTER_STATE_RESET)
+		i2o_iop_clear(c);
+
+	i2o_status_get(c);
+
+	if (sb->iop_state != ADAPTER_STATE_HOLD) {
+		osm_err("%s: failed to bring IOP into HOLD state\n", c->name);
+		return -EIO;
+	}
+
+	return i2o_hrt_get(c);
+};
+
+static void i2o_res_alloc(struct i2o_controller *c, unsigned long flags)
+{
+	i2o_status_block *sb = c->status_block.virt;
+	struct resource *res = &c->mem_resource;
+	resource_size_t size, align;
+	int err;
+
+	res->name = c->pdev->bus->name;
+	res->flags = flags;
+	res->start = 0;
+	res->end = 0;
+	osm_info("%s: requires private memory resources.\n", c->name);
+
+	if (flags & IORESOURCE_MEM) {
+		size = sb->desired_mem_size;
+		align = 1 << 20;	/* unspecified, use 1Mb and play safe */
+	} else {
+		size = sb->desired_io_size;
+		align = 1 << 12;	/* unspecified, use 4Kb and play safe */
+	}
+
+	err = pci_bus_alloc_resource(c->pdev->bus, res, size, align, 0, 0,
+				     NULL, NULL);
+	if (err < 0)
+		return;
+
+	if (flags & IORESOURCE_MEM) {
+		c->mem_alloc = 1;
+		sb->current_mem_size = resource_size(res);
+		sb->current_mem_base = res->start;
+	} else if (flags & IORESOURCE_IO) {
+		c->io_alloc = 1;
+		sb->current_io_size = resource_size(res);
+		sb->current_io_base = res->start;
+	}
+	osm_info("%s: allocated PCI space %pR\n", c->name, res);
+}
+
+/**
+ *	i2o_iop_systab_set - Set the I2O System Table of the specified IOP
+ *	@c: I2O controller to which the system table should be send
+ *
+ *	Before the systab could be set i2o_systab_build() must be called.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_systab_set(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	i2o_status_block *sb = c->status_block.virt;
+	struct device *dev = &c->pdev->dev;
+	int rc;
+
+	if (sb->current_mem_size < sb->desired_mem_size)
+		i2o_res_alloc(c, IORESOURCE_MEM);
+
+	if (sb->current_io_size < sb->desired_io_size)
+		i2o_res_alloc(c, IORESOURCE_IO);
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
+					 PCI_DMA_TODEVICE);
+	if (!i2o_systab.phys) {
+		i2o_msg_nop(c, msg);
+		return -ENOMEM;
+	}
+
+	msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+
+	/*
+	 * Provide three SGL-elements:
+	 * System table (SysTab), Private memory space declaration and
+	 * Private i/o space declaration
+	 */
+
+	msg->body[0] = cpu_to_le32(c->unit + 2);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
+	msg->body[3] = cpu_to_le32(i2o_systab.phys);
+	msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
+	msg->body[5] = cpu_to_le32(sb->current_mem_base);
+	msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
+	msg->body[6] = cpu_to_le32(sb->current_io_base);
+
+	rc = i2o_msg_post_wait(c, msg, 120);
+
+	dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
+			 PCI_DMA_TODEVICE);
+
+	if (rc < 0)
+		osm_err("%s: Unable to set SysTab (status=%#x).\n", c->name,
+			-rc);
+	else
+		osm_debug("%s: SysTab set.\n", c->name);
+
+	return rc;
+}
+
+/**
+ *	i2o_iop_online - Bring a controller online into OPERATIONAL state.
+ *	@c: I2O controller
+ *
+ *	Send the system table and enable the I2O controller.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_online(struct i2o_controller *c)
+{
+	int rc;
+
+	rc = i2o_iop_systab_set(c);
+	if (rc)
+		return rc;
+
+	/* In READY state */
+	osm_debug("%s: Attempting to enable...\n", c->name);
+	rc = i2o_iop_enable(c);
+	if (rc)
+		return rc;
+
+	return 0;
+};
+
+/**
+ *	i2o_iop_remove - Remove the I2O controller from the I2O core
+ *	@c: I2O controller
+ *
+ *	Remove the I2O controller from the I2O core. If devices are attached to
+ *	the controller remove these also and finally reset the controller.
+ */
+void i2o_iop_remove(struct i2o_controller *c)
+{
+	struct i2o_device *dev, *tmp;
+
+	osm_debug("%s: deleting controller\n", c->name);
+
+	i2o_driver_notify_controller_remove_all(c);
+
+	list_del(&c->list);
+
+	list_for_each_entry_safe(dev, tmp, &c->devices, list)
+	    i2o_device_remove(dev);
+
+	device_del(&c->device);
+
+	/* Ask the IOP to switch to RESET state */
+	i2o_iop_reset(c);
+}
+
+/**
+ *	i2o_systab_build - Build system table
+ *
+ *	The system table contains information about all the IOPs in the system
+ *	(duh) and is used by the Executives on the IOPs to establish peer2peer
+ *	connections. We're not supporting peer2peer at the moment, but this
+ *	will be needed down the road for things like lan2lan forwarding.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_systab_build(void)
+{
+	struct i2o_controller *c, *tmp;
+	int num_controllers = 0;
+	u32 change_ind = 0;
+	int count = 0;
+	struct i2o_sys_tbl *systab = i2o_systab.virt;
+
+	list_for_each_entry_safe(c, tmp, &i2o_controllers, list)
+	    num_controllers++;
+
+	if (systab) {
+		change_ind = systab->change_ind;
+		kfree(i2o_systab.virt);
+	}
+
+	/* Header + IOPs */
+	i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers *
+	    sizeof(struct i2o_sys_tbl_entry);
+
+	systab = i2o_systab.virt = kzalloc(i2o_systab.len, GFP_KERNEL);
+	if (!systab) {
+		osm_err("unable to allocate memory for System Table\n");
+		return -ENOMEM;
+	}
+
+	systab->version = I2OVERSION;
+	systab->change_ind = change_ind + 1;
+
+	list_for_each_entry_safe(c, tmp, &i2o_controllers, list) {
+		i2o_status_block *sb;
+
+		if (count >= num_controllers) {
+			osm_err("controller added while building system table"
+				"\n");
+			break;
+		}
+
+		sb = c->status_block.virt;
+
+		/*
+		 * Get updated IOP state so we have the latest information
+		 *
+		 * We should delete the controller at this point if it
+		 * doesn't respond since if it's not on the system table
+		 * it is techninically not part of the I2O subsystem...
+		 */
+		if (unlikely(i2o_status_get(c))) {
+			osm_err("%s: Deleting b/c could not get status while "
+				"attempting to build system table\n", c->name);
+			i2o_iop_remove(c);
+			continue;	// try the next one
+		}
+
+		systab->iops[count].org_id = sb->org_id;
+		systab->iops[count].iop_id = c->unit + 2;
+		systab->iops[count].seg_num = 0;
+		systab->iops[count].i2o_version = sb->i2o_version;
+		systab->iops[count].iop_state = sb->iop_state;
+		systab->iops[count].msg_type = sb->msg_type;
+		systab->iops[count].frame_size = sb->inbound_frame_size;
+		systab->iops[count].last_changed = change_ind;
+		systab->iops[count].iop_capabilities = sb->iop_capabilities;
+		systab->iops[count].inbound_low =
+		    i2o_dma_low(c->base.phys + I2O_IN_PORT);
+		systab->iops[count].inbound_high =
+		    i2o_dma_high(c->base.phys + I2O_IN_PORT);
+
+		count++;
+	}
+
+	systab->num_entries = count;
+
+	return 0;
+};
+
+/**
+ *	i2o_parse_hrt - Parse the hardware resource table.
+ *	@c: I2O controller
+ *
+ *	We don't do anything with it except dumping it (in debug mode).
+ *
+ *	Returns 0.
+ */
+static int i2o_parse_hrt(struct i2o_controller *c)
+{
+	i2o_dump_hrt(c);
+	return 0;
+};
+
+/**
+ *	i2o_status_get - Get the status block from the I2O controller
+ *	@c: I2O controller
+ *
+ *	Issue a status query on the controller. This updates the attached
+ *	status block. The status block could then be accessed through
+ *	c->status_block.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_status_get(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	volatile u8 *status_block;
+	unsigned long timeout;
+
+	status_block = (u8 *) c->status_block.virt;
+	memset(c->status_block.virt, 0, sizeof(i2o_status_block));
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0x00000000);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys));
+	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys));
+	msg->body[4] = cpu_to_le32(sizeof(i2o_status_block));	/* always 88 bytes */
+
+	i2o_msg_post(c, msg);
+
+	/* Wait for a reply */
+	timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ;
+	while (status_block[87] != 0xFF) {
+		if (time_after(jiffies, timeout)) {
+			osm_err("%s: Get status timeout.\n", c->name);
+			return -ETIMEDOUT;
+		}
+
+		schedule_timeout_uninterruptible(1);
+	}
+
+#ifdef DEBUG
+	i2o_debug_state(c);
+#endif
+
+	return 0;
+}
+
+/*
+ *	i2o_hrt_get - Get the Hardware Resource Table from the I2O controller
+ *	@c: I2O controller from which the HRT should be fetched
+ *
+ *	The HRT contains information about possible hidden devices but is
+ *	mostly useless to us.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_hrt_get(struct i2o_controller *c)
+{
+	int rc;
+	int i;
+	i2o_hrt *hrt = c->hrt.virt;
+	u32 size = sizeof(i2o_hrt);
+	struct device *dev = &c->pdev->dev;
+
+	for (i = 0; i < I2O_HRT_GET_TRIES; i++) {
+		struct i2o_message *msg;
+
+		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
+
+		msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 |
+				ADAPTER_TID);
+		msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len);
+		msg->body[1] = cpu_to_le32(c->hrt.phys);
+
+		rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt);
+
+		if (rc < 0) {
+			osm_err("%s: Unable to get HRT (status=%#x)\n", c->name,
+				-rc);
+			return rc;
+		}
+
+		size = hrt->num_entries * hrt->entry_len << 2;
+		if (size > c->hrt.len) {
+			if (i2o_dma_realloc(dev, &c->hrt, size))
+				return -ENOMEM;
+			else
+				hrt = c->hrt.virt;
+		} else
+			return i2o_parse_hrt(c);
+	}
+
+	osm_err("%s: Unable to get HRT after %d tries, giving up\n", c->name,
+		I2O_HRT_GET_TRIES);
+
+	return -EBUSY;
+}
+
+/**
+ *	i2o_iop_release - release the memory for a I2O controller
+ *	@dev: I2O controller which should be released
+ *
+ *	Release the allocated memory. This function is called if refcount of
+ *	device reaches 0 automatically.
+ */
+static void i2o_iop_release(struct device *dev)
+{
+	struct i2o_controller *c = to_i2o_controller(dev);
+
+	i2o_iop_free(c);
+};
+
+/**
+ *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
+ *
+ *	Allocate the necessary memory for a i2o_controller struct and
+ *	initialize the lists and message mempool.
+ *
+ *	Returns a pointer to the I2O controller or a negative error code on
+ *	failure.
+ */
+struct i2o_controller *i2o_iop_alloc(void)
+{
+	static int unit = 0;	/* 0 and 1 are NULL IOP and Local Host */
+	struct i2o_controller *c;
+	char poolname[32];
+
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
+	if (!c) {
+		osm_err("i2o: Insufficient memory to allocate a I2O controller."
+			"\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	c->unit = unit++;
+	sprintf(c->name, "iop%d", c->unit);
+
+	snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
+	if (i2o_pool_alloc
+	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32),
+	     I2O_MSG_INPOOL_MIN)) {
+		kfree(c);
+		return ERR_PTR(-ENOMEM);
+	};
+
+	INIT_LIST_HEAD(&c->devices);
+	spin_lock_init(&c->lock);
+	mutex_init(&c->lct_lock);
+
+	device_initialize(&c->device);
+
+	c->device.release = &i2o_iop_release;
+
+	dev_set_name(&c->device, "iop%d", c->unit);
+
+#if BITS_PER_LONG == 64
+	spin_lock_init(&c->context_list_lock);
+	atomic_set(&c->context_list_counter, 0);
+	INIT_LIST_HEAD(&c->context_list);
+#endif
+
+	return c;
+};
+
+/**
+ *	i2o_iop_add - Initialize the I2O controller and add him to the I2O core
+ *	@c: controller
+ *
+ *	Initialize the I2O controller and if no error occurs add him to the I2O
+ *	core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_iop_add(struct i2o_controller *c)
+{
+	int rc;
+
+	if ((rc = device_add(&c->device))) {
+		osm_err("%s: could not add controller\n", c->name);
+		goto iop_reset;
+	}
+
+	osm_info("%s: Activating I2O controller...\n", c->name);
+	osm_info("%s: This may take a few minutes if there are many devices\n",
+		 c->name);
+
+	if ((rc = i2o_iop_activate(c))) {
+		osm_err("%s: could not activate controller\n", c->name);
+		goto device_del;
+	}
+
+	osm_debug("%s: building sys table...\n", c->name);
+
+	if ((rc = i2o_systab_build()))
+		goto device_del;
+
+	osm_debug("%s: online controller...\n", c->name);
+
+	if ((rc = i2o_iop_online(c)))
+		goto device_del;
+
+	osm_debug("%s: getting LCT...\n", c->name);
+
+	if ((rc = i2o_exec_lct_get(c)))
+		goto device_del;
+
+	list_add(&c->list, &i2o_controllers);
+
+	i2o_driver_notify_controller_add_all(c);
+
+	osm_info("%s: Controller added\n", c->name);
+
+	return 0;
+
+      device_del:
+	device_del(&c->device);
+
+      iop_reset:
+	i2o_iop_reset(c);
+
+	return rc;
+};
+
+/**
+ *	i2o_event_register - Turn on/off event notification for a I2O device
+ *	@dev: I2O device which should receive the event registration request
+ *	@drv: driver which want to get notified
+ *	@tcntxt: transaction context to use with this notifier
+ *	@evt_mask: mask of events
+ *
+ *	Create and posts an event registration message to the task. No reply
+ *	is waited for, or expected. If you do not want further notifications,
+ *	call the i2o_event_register again with a evt_mask of 0.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv,
+		       int tcntxt, u32 evt_mask)
+{
+	struct i2o_controller *c = dev->iop;
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->u.s.icntxt = cpu_to_le32(drv->context);
+	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
+	msg->body[0] = cpu_to_le32(evt_mask);
+
+	i2o_msg_post(c, msg);
+
+	return 0;
+};
+
+/**
+ *	i2o_iop_init - I2O main initialization function
+ *
+ *	Initialize the I2O drivers (OSM) functions, register the Executive OSM,
+ *	initialize the I2O PCI part and finally initialize I2O device stuff.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int __init i2o_iop_init(void)
+{
+	int rc = 0;
+
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	if ((rc = i2o_driver_init()))
+		goto exit;
+
+	if ((rc = i2o_exec_init()))
+		goto driver_exit;
+
+	if ((rc = i2o_pci_init()))
+		goto exec_exit;
+
+	return 0;
+
+      exec_exit:
+	i2o_exec_exit();
+
+      driver_exit:
+	i2o_driver_exit();
+
+      exit:
+	return rc;
+}
+
+/**
+ *	i2o_iop_exit - I2O main exit function
+ *
+ *	Removes I2O controllers from PCI subsystem and shut down OSMs.
+ */
+static void __exit i2o_iop_exit(void)
+{
+	i2o_pci_exit();
+	i2o_exec_exit();
+	i2o_driver_exit();
+};
+
+module_init(i2o_iop_init);
+module_exit(i2o_iop_exit);
+
+MODULE_AUTHOR("Red Hat Software");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+#if BITS_PER_LONG == 64
+EXPORT_SYMBOL(i2o_cntxt_list_add);
+EXPORT_SYMBOL(i2o_cntxt_list_get);
+EXPORT_SYMBOL(i2o_cntxt_list_remove);
+EXPORT_SYMBOL(i2o_cntxt_list_get_ptr);
+#endif
+EXPORT_SYMBOL(i2o_msg_get_wait);
+EXPORT_SYMBOL(i2o_find_iop);
+EXPORT_SYMBOL(i2o_iop_find_device);
+EXPORT_SYMBOL(i2o_event_register);
+EXPORT_SYMBOL(i2o_status_get);
+EXPORT_SYMBOL(i2o_controllers);
diff --git a/drivers/staging/i2o/memory.c b/drivers/staging/i2o/memory.c
new file mode 100644
index 000000000000..8f9509d275a4
--- /dev/null
+++ b/drivers/staging/i2o/memory.c
@@ -0,0 +1,313 @@
+/*
+ *	Functions to handle I2O memory
+ *
+ *	Pulled from the inlines in i2o headers and uninlined
+ *
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "core.h"
+
+/* Protects our 32/64bit mask switching */
+static DEFINE_MUTEX(mem_lock);
+
+/**
+ *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
+ *	@c: I2O controller for which the calculation should be done
+ *	@body_size: maximum body size used for message in 32-bit words.
+ *
+ *	Return the maximum number of SG elements in a SG list.
+ */
+u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
+{
+	i2o_status_block *sb = c->status_block.virt;
+	u16 sg_count =
+	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
+	    body_size;
+
+	if (c->pae_support) {
+		/*
+		 * for 64-bit a SG attribute element must be added and each
+		 * SG element needs 12 bytes instead of 8.
+		 */
+		sg_count -= 2;
+		sg_count /= 3;
+	} else
+		sg_count /= 2;
+
+	if (c->short_req && (sg_count > 8))
+		sg_count = 8;
+
+	return sg_count;
+}
+EXPORT_SYMBOL_GPL(i2o_sg_tablesize);
+
+
+/**
+ *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@ptr: pointer to the data which should be mapped
+ *	@size: size of data in bytes
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
+ *	SG list if the allocation was successful.
+ *
+ *	Returns DMA address which must be checked for failures using
+ *	dma_mapping_error().
+ */
+dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+					    size_t size,
+					    enum dma_data_direction direction,
+					    u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+	dma_addr_t dma_addr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0xd4000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0xd0000000;
+		break;
+	default:
+		return 0;
+	}
+
+	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
+	if (!dma_mapping_error(&c->pdev->dev, dma_addr)) {
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+			*mptr++ = cpu_to_le32(0x7C020002);
+			*mptr++ = cpu_to_le32(PAGE_SIZE);
+		}
+#endif
+
+		*mptr++ = cpu_to_le32(sg_flags | size);
+		*mptr++ = cpu_to_le32(i2o_dma_low(dma_addr));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(dma_addr));
+#endif
+		*sg_ptr = mptr;
+	}
+	return dma_addr;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_map_single);
+
+/**
+ *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@sg: SG list to be mapped
+ *	@sg_count: number of elements in the SG list
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
+ *	list if the allocation was successful.
+ *
+ *	Returns 0 on failure or 1 on success.
+ */
+int i2o_dma_map_sg(struct i2o_controller *c, struct scatterlist *sg,
+	    int sg_count, enum dma_data_direction direction, u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0x14000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0x10000000;
+		break;
+	default:
+		return 0;
+	}
+
+	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
+	if (!sg_count)
+		return 0;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+		*mptr++ = cpu_to_le32(0x7C020002);
+		*mptr++ = cpu_to_le32(PAGE_SIZE);
+	}
+#endif
+
+	while (sg_count-- > 0) {
+		if (!sg_count)
+			sg_flags |= 0xC0000000;
+		*mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
+		*mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
+#endif
+		sg = sg_next(sg);
+	}
+	*sg_ptr = mptr;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_map_sg);
+
+/**
+ *	i2o_dma_alloc - Allocate DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which should get the DMA buffer
+ *	@len: length of the new DMA memory
+ *
+ *	Allocate a coherent DMA memory and write the pointers into addr.
+ *
+ *	Returns 0 on success or -ENOMEM on failure.
+ */
+int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int dma_64 = 0;
+
+	mutex_lock(&mem_lock);
+	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_BIT_MASK(64))) {
+		dma_64 = 1;
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+			mutex_unlock(&mem_lock);
+			return -ENOMEM;
+		}
+	}
+
+	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, GFP_KERNEL);
+
+	if ((sizeof(dma_addr_t) > 4) && dma_64)
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
+			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
+	mutex_unlock(&mem_lock);
+
+	if (!addr->virt)
+		return -ENOMEM;
+
+	memset(addr->virt, 0, len);
+	addr->len = len;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_alloc);
+
+
+/**
+ *	i2o_dma_free - Free DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which contains the DMA buffer
+ *
+ *	Free a coherent DMA memory and set virtual address of addr to NULL.
+ */
+void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
+{
+	if (addr->virt) {
+		if (addr->phys)
+			dma_free_coherent(dev, addr->len, addr->virt,
+					  addr->phys);
+		else
+			kfree(addr->virt);
+		addr->virt = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(i2o_dma_free);
+
+
+/**
+ *	i2o_dma_realloc - Realloc DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: pointer to a i2o_dma struct DMA buffer
+ *	@len: new length of memory
+ *
+ *	If there was something allocated in the addr, free it first. If len > 0
+ *	than try to allocate it and write the addresses back to the addr
+ *	structure. If len == 0 set the virtual address to NULL.
+ *
+ *	Returns the 0 on success or negative error code on failure.
+ */
+int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, size_t len)
+{
+	i2o_dma_free(dev, addr);
+
+	if (len)
+		return i2o_dma_alloc(dev, addr, len);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_realloc);
+
+/*
+ *	i2o_pool_alloc - Allocate an slab cache and mempool
+ *	@mempool: pointer to struct i2o_pool to write data into.
+ *	@name: name which is used to identify cache
+ *	@size: size of each object
+ *	@min_nr: minimum number of objects
+ *
+ *	First allocates a slab cache with name and size. Then allocates a
+ *	mempool which uses the slab cache for allocation and freeing.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+				 size_t size, int min_nr)
+{
+	pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (!pool->name)
+		goto exit;
+	strcpy(pool->name, name);
+
+	pool->slab =
+	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!pool->slab)
+		goto free_name;
+
+	pool->mempool = mempool_create_slab_pool(min_nr, pool->slab);
+	if (!pool->mempool)
+		goto free_slab;
+
+	return 0;
+
+free_slab:
+	kmem_cache_destroy(pool->slab);
+
+free_name:
+	kfree(pool->name);
+
+exit:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(i2o_pool_alloc);
+
+/*
+ *	i2o_pool_free - Free slab cache and mempool again
+ *	@mempool: pointer to struct i2o_pool which should be freed
+ *
+ *	Note that you have to return all objects to the mempool again before
+ *	calling i2o_pool_free().
+ */
+void i2o_pool_free(struct i2o_pool *pool)
+{
+	mempool_destroy(pool->mempool);
+	kmem_cache_destroy(pool->slab);
+	kfree(pool->name);
+};
+EXPORT_SYMBOL_GPL(i2o_pool_free);
diff --git a/drivers/staging/i2o/pci.c b/drivers/staging/i2o/pci.c
new file mode 100644
index 000000000000..b3b8a61dd4a6
--- /dev/null
+++ b/drivers/staging/i2o/pci.c
@@ -0,0 +1,497 @@
+/*
+ *	PCI handling of I2O controller
+ *
+ * 	Copyright (C) 1999-2002	Red Hat Software
+ *
+ *	Written by Alan Cox, Building Number Three Ltd
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	A lot of the I2O message side code from this is taken from the Red
+ *	Creek RCPCI45 adapter driver by Red Creek Communications
+ *
+ *	Fixes/additions:
+ *		Philipp Rumpf
+ *		Juha Sievänen <Juha.Sievanen@cs.Helsinki.FI>
+ *		Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
+ *		Deepak Saxena <deepak@plexity.net>
+ *		Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>:
+ *			Ported to Linux 2.5.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Minor fixes for 2.6.
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>:
+ *			Support for sysfs included.
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include "i2o.h"
+#include <linux/module.h>
+#include "core.h"
+
+#define OSM_DESCRIPTION	"I2O-subsystem"
+
+/* PCI device id table for all I2O controllers */
+static struct pci_device_id i2o_pci_ids[] = {
+	{PCI_DEVICE_CLASS(PCI_CLASS_INTELLIGENT_I2O << 8, 0xffff00)},
+	{PCI_DEVICE(PCI_VENDOR_ID_DPT, 0xa511)},
+	{.vendor = PCI_VENDOR_ID_INTEL,.device = 0x1962,
+	 .subvendor = PCI_VENDOR_ID_PROMISE,.subdevice = PCI_ANY_ID},
+	{0}
+};
+
+/**
+ *	i2o_pci_free - Frees the DMA memory for the I2O controller
+ *	@c: I2O controller to free
+ *
+ *	Remove all allocated DMA memory and unmap memory IO regions. If MTRR
+ *	is enabled, also remove it again.
+ */
+static void i2o_pci_free(struct i2o_controller *c)
+{
+	struct device *dev;
+
+	dev = &c->pdev->dev;
+
+	i2o_dma_free(dev, &c->out_queue);
+	i2o_dma_free(dev, &c->status_block);
+	kfree(c->lct);
+	i2o_dma_free(dev, &c->dlct);
+	i2o_dma_free(dev, &c->hrt);
+	i2o_dma_free(dev, &c->status);
+
+	if (c->raptor && c->in_queue.virt)
+		iounmap(c->in_queue.virt);
+
+	if (c->base.virt)
+		iounmap(c->base.virt);
+
+	pci_release_regions(c->pdev);
+}
+
+/**
+ *	i2o_pci_alloc - Allocate DMA memory, map IO memory for I2O controller
+ *	@c: I2O controller
+ *
+ *	Allocate DMA memory for a PCI (or in theory AGP) I2O controller. All
+ *	IO mappings are also done here. If MTRR is enabled, also do add memory
+ *	regions here.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_pci_alloc(struct i2o_controller *c)
+{
+	struct pci_dev *pdev = c->pdev;
+	struct device *dev = &pdev->dev;
+	int i;
+
+	if (pci_request_regions(pdev, OSM_DESCRIPTION)) {
+		printk(KERN_ERR "%s: device already claimed\n", c->name);
+		return -ENODEV;
+	}
+
+	for (i = 0; i < 6; i++) {
+		/* Skip I/O spaces */
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_IO)) {
+			if (!c->base.phys) {
+				c->base.phys = pci_resource_start(pdev, i);
+				c->base.len = pci_resource_len(pdev, i);
+
+				/*
+				 * If we know what card it is, set the size
+				 * correctly. Code is taken from dpt_i2o.c
+				 */
+				if (pdev->device == 0xa501) {
+					if (pdev->subsystem_device >= 0xc032 &&
+					    pdev->subsystem_device <= 0xc03b) {
+						if (c->base.len > 0x400000)
+							c->base.len = 0x400000;
+					} else {
+						if (c->base.len > 0x100000)
+							c->base.len = 0x100000;
+					}
+				}
+				if (!c->raptor)
+					break;
+			} else {
+				c->in_queue.phys = pci_resource_start(pdev, i);
+				c->in_queue.len = pci_resource_len(pdev, i);
+				break;
+			}
+		}
+	}
+
+	if (i == 6) {
+		printk(KERN_ERR "%s: I2O controller has no memory regions"
+		       " defined.\n", c->name);
+		i2o_pci_free(c);
+		return -EINVAL;
+	}
+
+	/* Map the I2O controller */
+	if (c->raptor) {
+		printk(KERN_INFO "%s: PCI I2O controller\n", c->name);
+		printk(KERN_INFO "     BAR0 at 0x%08lX size=%ld\n",
+		       (unsigned long)c->base.phys, (unsigned long)c->base.len);
+		printk(KERN_INFO "     BAR1 at 0x%08lX size=%ld\n",
+		       (unsigned long)c->in_queue.phys,
+		       (unsigned long)c->in_queue.len);
+	} else
+		printk(KERN_INFO "%s: PCI I2O controller at %08lX size=%ld\n",
+		       c->name, (unsigned long)c->base.phys,
+		       (unsigned long)c->base.len);
+
+	c->base.virt = ioremap_nocache(c->base.phys, c->base.len);
+	if (!c->base.virt) {
+		printk(KERN_ERR "%s: Unable to map controller.\n", c->name);
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (c->raptor) {
+		c->in_queue.virt =
+		    ioremap_nocache(c->in_queue.phys, c->in_queue.len);
+		if (!c->in_queue.virt) {
+			printk(KERN_ERR "%s: Unable to map controller.\n",
+			       c->name);
+			i2o_pci_free(c);
+			return -ENOMEM;
+		}
+	} else
+		c->in_queue = c->base;
+
+	c->irq_status = c->base.virt + I2O_IRQ_STATUS;
+	c->irq_mask = c->base.virt + I2O_IRQ_MASK;
+	c->in_port = c->base.virt + I2O_IN_PORT;
+	c->out_port = c->base.virt + I2O_OUT_PORT;
+
+	/* Motorola/Freescale chip does not follow spec */
+	if (pdev->vendor == PCI_VENDOR_ID_MOTOROLA && pdev->device == 0x18c0) {
+		/* Check if CPU is enabled */
+		if (be32_to_cpu(readl(c->base.virt + 0x10000)) & 0x10000000) {
+			printk(KERN_INFO "%s: MPC82XX needs CPU running to "
+			       "service I2O.\n", c->name);
+			i2o_pci_free(c);
+			return -ENODEV;
+		} else {
+			c->irq_status += I2O_MOTOROLA_PORT_OFFSET;
+			c->irq_mask += I2O_MOTOROLA_PORT_OFFSET;
+			c->in_port += I2O_MOTOROLA_PORT_OFFSET;
+			c->out_port += I2O_MOTOROLA_PORT_OFFSET;
+			printk(KERN_INFO "%s: MPC82XX workarounds activated.\n",
+			       c->name);
+		}
+	}
+
+	if (i2o_dma_alloc(dev, &c->status, 8)) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->hrt, sizeof(i2o_hrt))) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->dlct, 8192)) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->status_block, sizeof(i2o_status_block))) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->out_queue,
+		I2O_MAX_OUTBOUND_MSG_FRAMES * I2O_OUTBOUND_MSG_FRAME_SIZE *
+				sizeof(u32))) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	pci_set_drvdata(pdev, c);
+
+	return 0;
+}
+
+/**
+ *	i2o_pci_interrupt - Interrupt handler for I2O controller
+ *	@irq: interrupt line
+ *	@dev_id: pointer to the I2O controller
+ *
+ *	Handle an interrupt from a PCI based I2O controller. This turns out
+ *	to be rather simple. We keep the controller pointer in the cookie.
+ */
+static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id)
+{
+	struct i2o_controller *c = dev_id;
+	u32 m;
+	irqreturn_t rc = IRQ_NONE;
+
+	while (readl(c->irq_status) & I2O_IRQ_OUTBOUND_POST) {
+		m = readl(c->out_port);
+		if (m == I2O_QUEUE_EMPTY) {
+			/*
+			 * Old 960 steppings had a bug in the I2O unit that
+			 * caused the queue to appear empty when it wasn't.
+			 */
+			m = readl(c->out_port);
+			if (unlikely(m == I2O_QUEUE_EMPTY))
+				break;
+		}
+
+		/* dispatch it */
+		if (i2o_driver_dispatch(c, m))
+			/* flush it if result != 0 */
+			i2o_flush_reply(c, m);
+
+		rc = IRQ_HANDLED;
+	}
+
+	return rc;
+}
+
+/**
+ *	i2o_pci_irq_enable - Allocate interrupt for I2O controller
+ *	@c: i2o_controller that the request is for
+ *
+ *	Allocate an interrupt for the I2O controller, and activate interrupts
+ *	on the I2O controller.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_pci_irq_enable(struct i2o_controller *c)
+{
+	struct pci_dev *pdev = c->pdev;
+	int rc;
+
+	writel(0xffffffff, c->irq_mask);
+
+	if (pdev->irq) {
+		rc = request_irq(pdev->irq, i2o_pci_interrupt, IRQF_SHARED,
+				 c->name, c);
+		if (rc < 0) {
+			printk(KERN_ERR "%s: unable to allocate interrupt %d."
+			       "\n", c->name, pdev->irq);
+			return rc;
+		}
+	}
+
+	writel(0x00000000, c->irq_mask);
+
+	printk(KERN_INFO "%s: Installed at IRQ %d\n", c->name, pdev->irq);
+
+	return 0;
+}
+
+/**
+ *	i2o_pci_irq_disable - Free interrupt for I2O controller
+ *	@c: I2O controller
+ *
+ *	Disable interrupts in I2O controller and then free interrupt.
+ */
+static void i2o_pci_irq_disable(struct i2o_controller *c)
+{
+	writel(0xffffffff, c->irq_mask);
+
+	if (c->pdev->irq > 0)
+		free_irq(c->pdev->irq, c);
+}
+
+/**
+ *	i2o_pci_probe - Probe the PCI device for an I2O controller
+ *	@pdev: PCI device to test
+ *	@id: id which matched with the PCI device id table
+ *
+ *	Probe the PCI device for any device which is a memory of the
+ *	Intelligent, I2O class or an Adaptec Zero Channel Controller. We
+ *	attempt to set up each such device and register it with the core.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static int i2o_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct i2o_controller *c;
+	int rc;
+	struct pci_dev *i960 = NULL;
+
+	printk(KERN_INFO "i2o: Checking for PCI I2O controllers...\n");
+
+	if ((pdev->class & 0xff) > 1) {
+		printk(KERN_WARNING "i2o: %s does not support I2O 1.5 "
+		       "(skipping).\n", pci_name(pdev));
+		return -ENODEV;
+	}
+
+	if ((rc = pci_enable_device(pdev))) {
+		printk(KERN_WARNING "i2o: couldn't enable device %s\n",
+		       pci_name(pdev));
+		return rc;
+	}
+
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+		printk(KERN_WARNING "i2o: no suitable DMA found for %s\n",
+		       pci_name(pdev));
+		rc = -ENODEV;
+		goto disable;
+	}
+
+	pci_set_master(pdev);
+
+	c = i2o_iop_alloc();
+	if (IS_ERR(c)) {
+		printk(KERN_ERR "i2o: couldn't allocate memory for %s\n",
+		       pci_name(pdev));
+		rc = PTR_ERR(c);
+		goto disable;
+	} else
+		printk(KERN_INFO "%s: controller found (%s)\n", c->name,
+		       pci_name(pdev));
+
+	c->pdev = pdev;
+	c->device.parent = &pdev->dev;
+
+	/* Cards that fall apart if you hit them with large I/O loads... */
+	if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
+		c->short_req = 1;
+		printk(KERN_INFO "%s: Symbios FC920 workarounds activated.\n",
+		       c->name);
+	}
+
+	if (pdev->subsystem_vendor == PCI_VENDOR_ID_PROMISE) {
+		/*
+		 * Expose the ship behind i960 for initialization, or it will
+		 * failed
+		 */
+		i960 = pci_get_slot(c->pdev->bus,
+				  PCI_DEVFN(PCI_SLOT(c->pdev->devfn), 0));
+
+		if (i960) {
+			pci_write_config_word(i960, 0x42, 0);
+			pci_dev_put(i960);
+		}
+
+		c->promise = 1;
+		c->limit_sectors = 1;
+	}
+
+	if (pdev->subsystem_vendor == PCI_VENDOR_ID_DPT)
+		c->adaptec = 1;
+
+	/* Cards that go bananas if you quiesce them before you reset them. */
+	if (pdev->vendor == PCI_VENDOR_ID_DPT) {
+		c->no_quiesce = 1;
+		if (pdev->device == 0xa511)
+			c->raptor = 1;
+
+		if (pdev->subsystem_device == 0xc05a) {
+			c->limit_sectors = 1;
+			printk(KERN_INFO
+			       "%s: limit sectors per request to %d\n", c->name,
+			       I2O_MAX_SECTORS_LIMITED);
+		}
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if (sizeof(dma_addr_t) > 4) {
+			if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
+				printk(KERN_INFO "%s: 64-bit DMA unavailable\n",
+				       c->name);
+			else {
+				c->pae_support = 1;
+				printk(KERN_INFO "%s: using 64-bit DMA\n",
+				       c->name);
+			}
+		}
+#endif
+	}
+
+	if ((rc = i2o_pci_alloc(c))) {
+		printk(KERN_ERR "%s: DMA / IO allocation for I2O controller "
+		       "failed\n", c->name);
+		goto free_controller;
+	}
+
+	if (i2o_pci_irq_enable(c)) {
+		printk(KERN_ERR "%s: unable to enable interrupts for I2O "
+		       "controller\n", c->name);
+		goto free_pci;
+	}
+
+	if ((rc = i2o_iop_add(c)))
+		goto uninstall;
+
+	if (i960)
+		pci_write_config_word(i960, 0x42, 0x03ff);
+
+	return 0;
+
+      uninstall:
+	i2o_pci_irq_disable(c);
+
+      free_pci:
+	i2o_pci_free(c);
+
+      free_controller:
+	i2o_iop_free(c);
+
+      disable:
+	pci_disable_device(pdev);
+
+	return rc;
+}
+
+/**
+ *	i2o_pci_remove - Removes a I2O controller from the system
+ *	@pdev: I2O controller which should be removed
+ *
+ *	Reset the I2O controller, disable interrupts and remove all allocated
+ *	resources.
+ */
+static void i2o_pci_remove(struct pci_dev *pdev)
+{
+	struct i2o_controller *c;
+	c = pci_get_drvdata(pdev);
+
+	i2o_iop_remove(c);
+	i2o_pci_irq_disable(c);
+	i2o_pci_free(c);
+
+	pci_disable_device(pdev);
+
+	printk(KERN_INFO "%s: Controller removed.\n", c->name);
+
+	put_device(&c->device);
+};
+
+/* PCI driver for I2O controller */
+static struct pci_driver i2o_pci_driver = {
+	.name = "PCI_I2O",
+	.id_table = i2o_pci_ids,
+	.probe = i2o_pci_probe,
+	.remove = i2o_pci_remove,
+};
+
+/**
+ *	i2o_pci_init - registers I2O PCI driver in PCI subsystem
+ *
+ *	Returns > 0 on success or negative error code on failure.
+ */
+int __init i2o_pci_init(void)
+{
+	return pci_register_driver(&i2o_pci_driver);
+};
+
+/**
+ *	i2o_pci_exit - unregisters I2O PCI driver from PCI subsystem
+ */
+void __exit i2o_pci_exit(void)
+{
+	pci_unregister_driver(&i2o_pci_driver);
+};
+
+MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
deleted file mode 100644
index d23c3c20b201..000000000000
--- a/include/linux/i2o.h
+++ /dev/null
@@ -1,988 +0,0 @@
-/*
- * I2O kernel space accessible structures/APIs
- *
- * (c) Copyright 1999, 2000 Red Hat Software
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *************************************************************************
- *
- * This header file defined the I2O APIs/structures for use by
- * the I2O kernel modules.
- *
- */
-
-#ifndef _I2O_H
-#define _I2O_H
-
-#include <linux/i2o-dev.h>
-
-/* How many different OSM's are we allowing */
-#define I2O_MAX_DRIVERS		8
-
-#include <linux/pci.h>
-#include <linux/bug.h>
-#include <linux/dma-mapping.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>	/* work_struct */
-#include <linux/mempool.h>
-#include <linux/mutex.h>
-#include <linux/scatterlist.h>
-#include <linux/semaphore.h>	/* Needed for MUTEX init macros */
-
-#include <asm/io.h>
-
-/* message queue empty */
-#define I2O_QUEUE_EMPTY		0xffffffff
-
-/*
- *	Cache strategies
- */
-
-/*	The NULL strategy leaves everything up to the controller. This tends to be a
- *	pessimal but functional choice.
- */
-#define CACHE_NULL		0
-/*	Prefetch data when reading. We continually attempt to load the next 32 sectors
- *	into the controller cache.
- */
-#define CACHE_PREFETCH		1
-/*	Prefetch data when reading. We sometimes attempt to load the next 32 sectors
- *	into the controller cache. When an I/O is less <= 8K we assume its probably
- *	not sequential and don't prefetch (default)
- */
-#define CACHE_SMARTFETCH	2
-/*	Data is written to the cache and then out on to the disk. The I/O must be
- *	physically on the medium before the write is acknowledged (default without
- *	NVRAM)
- */
-#define CACHE_WRITETHROUGH	17
-/*	Data is written to the cache and then out on to the disk. The controller
- *	is permitted to write back the cache any way it wants. (default if battery
- *	backed NVRAM is present). It can be useful to set this for swap regardless of
- *	battery state.
- */
-#define CACHE_WRITEBACK		18
-/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
- *	write large I/O's directly to disk bypassing the cache to avoid the extra
- *	memory copy hits. Small writes are writeback cached
- */
-#define CACHE_SMARTBACK		19
-/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
- *	write large I/O's directly to disk bypassing the cache to avoid the extra
- *	memory copy hits. Small writes are writethrough cached. Suitable for devices
- *	lacking battery backup
- */
-#define CACHE_SMARTTHROUGH	20
-
-/*
- *	Ioctl structures
- */
-
-#define 	BLKI2OGRSTRAT	_IOR('2', 1, int)
-#define 	BLKI2OGWSTRAT	_IOR('2', 2, int)
-#define 	BLKI2OSRSTRAT	_IOW('2', 3, int)
-#define 	BLKI2OSWSTRAT	_IOW('2', 4, int)
-
-/*
- *	I2O Function codes
- */
-
-/*
- *	Executive Class
- */
-#define	I2O_CMD_ADAPTER_ASSIGN		0xB3
-#define	I2O_CMD_ADAPTER_READ		0xB2
-#define	I2O_CMD_ADAPTER_RELEASE		0xB5
-#define	I2O_CMD_BIOS_INFO_SET		0xA5
-#define	I2O_CMD_BOOT_DEVICE_SET		0xA7
-#define	I2O_CMD_CONFIG_VALIDATE		0xBB
-#define	I2O_CMD_CONN_SETUP		0xCA
-#define	I2O_CMD_DDM_DESTROY		0xB1
-#define	I2O_CMD_DDM_ENABLE		0xD5
-#define	I2O_CMD_DDM_QUIESCE		0xC7
-#define	I2O_CMD_DDM_RESET		0xD9
-#define	I2O_CMD_DDM_SUSPEND		0xAF
-#define	I2O_CMD_DEVICE_ASSIGN		0xB7
-#define	I2O_CMD_DEVICE_RELEASE		0xB9
-#define	I2O_CMD_HRT_GET			0xA8
-#define	I2O_CMD_ADAPTER_CLEAR		0xBE
-#define	I2O_CMD_ADAPTER_CONNECT		0xC9
-#define	I2O_CMD_ADAPTER_RESET		0xBD
-#define	I2O_CMD_LCT_NOTIFY		0xA2
-#define	I2O_CMD_OUTBOUND_INIT		0xA1
-#define	I2O_CMD_PATH_ENABLE		0xD3
-#define	I2O_CMD_PATH_QUIESCE		0xC5
-#define	I2O_CMD_PATH_RESET		0xD7
-#define	I2O_CMD_STATIC_MF_CREATE	0xDD
-#define	I2O_CMD_STATIC_MF_RELEASE	0xDF
-#define	I2O_CMD_STATUS_GET		0xA0
-#define	I2O_CMD_SW_DOWNLOAD		0xA9
-#define	I2O_CMD_SW_UPLOAD		0xAB
-#define	I2O_CMD_SW_REMOVE		0xAD
-#define	I2O_CMD_SYS_ENABLE		0xD1
-#define	I2O_CMD_SYS_MODIFY		0xC1
-#define	I2O_CMD_SYS_QUIESCE		0xC3
-#define	I2O_CMD_SYS_TAB_SET		0xA3
-
-/*
- * Utility Class
- */
-#define I2O_CMD_UTIL_NOP		0x00
-#define I2O_CMD_UTIL_ABORT		0x01
-#define I2O_CMD_UTIL_CLAIM		0x09
-#define I2O_CMD_UTIL_RELEASE		0x0B
-#define I2O_CMD_UTIL_PARAMS_GET		0x06
-#define I2O_CMD_UTIL_PARAMS_SET		0x05
-#define I2O_CMD_UTIL_EVT_REGISTER	0x13
-#define I2O_CMD_UTIL_EVT_ACK		0x14
-#define I2O_CMD_UTIL_CONFIG_DIALOG	0x10
-#define I2O_CMD_UTIL_DEVICE_RESERVE	0x0D
-#define I2O_CMD_UTIL_DEVICE_RELEASE	0x0F
-#define I2O_CMD_UTIL_LOCK		0x17
-#define I2O_CMD_UTIL_LOCK_RELEASE	0x19
-#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY	0x15
-
-/*
- * SCSI Host Bus Adapter Class
- */
-#define I2O_CMD_SCSI_EXEC		0x81
-#define I2O_CMD_SCSI_ABORT		0x83
-#define I2O_CMD_SCSI_BUSRESET		0x27
-
-/*
- * Bus Adapter Class
- */
-#define I2O_CMD_BUS_ADAPTER_RESET	0x85
-#define I2O_CMD_BUS_RESET		0x87
-#define I2O_CMD_BUS_SCAN		0x89
-#define I2O_CMD_BUS_QUIESCE		0x8b
-
-/*
- * Random Block Storage Class
- */
-#define I2O_CMD_BLOCK_READ		0x30
-#define I2O_CMD_BLOCK_WRITE		0x31
-#define I2O_CMD_BLOCK_CFLUSH		0x37
-#define I2O_CMD_BLOCK_MLOCK		0x49
-#define I2O_CMD_BLOCK_MUNLOCK		0x4B
-#define I2O_CMD_BLOCK_MMOUNT		0x41
-#define I2O_CMD_BLOCK_MEJECT		0x43
-#define I2O_CMD_BLOCK_POWER		0x70
-
-#define I2O_CMD_PRIVATE			0xFF
-
-/* Command status values  */
-
-#define I2O_CMD_IN_PROGRESS	0x01
-#define I2O_CMD_REJECTED	0x02
-#define I2O_CMD_FAILED		0x03
-#define I2O_CMD_COMPLETED	0x04
-
-/* I2O API function return values */
-
-#define I2O_RTN_NO_ERROR			0
-#define I2O_RTN_NOT_INIT			1
-#define I2O_RTN_FREE_Q_EMPTY			2
-#define I2O_RTN_TCB_ERROR			3
-#define I2O_RTN_TRANSACTION_ERROR		4
-#define I2O_RTN_ADAPTER_ALREADY_INIT		5
-#define I2O_RTN_MALLOC_ERROR			6
-#define I2O_RTN_ADPTR_NOT_REGISTERED		7
-#define I2O_RTN_MSG_REPLY_TIMEOUT		8
-#define I2O_RTN_NO_STATUS			9
-#define I2O_RTN_NO_FIRM_VER			10
-#define	I2O_RTN_NO_LINK_SPEED			11
-
-/* Reply message status defines for all messages */
-
-#define I2O_REPLY_STATUS_SUCCESS                    	0x00
-#define I2O_REPLY_STATUS_ABORT_DIRTY                	0x01
-#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER     	0x02
-#define	I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER		0x03
-#define	I2O_REPLY_STATUS_ERROR_DIRTY			0x04
-#define	I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER		0x05
-#define	I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER		0x06
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY		0x08
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER	0x09
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER	0x0A
-#define	I2O_REPLY_STATUS_TRANSACTION_ERROR		0x0B
-#define	I2O_REPLY_STATUS_PROGRESS_REPORT		0x80
-
-/* Status codes and Error Information for Parameter functions */
-
-#define I2O_PARAMS_STATUS_SUCCESS		0x00
-#define I2O_PARAMS_STATUS_BAD_KEY_ABORT		0x01
-#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE   	0x02
-#define I2O_PARAMS_STATUS_BUFFER_FULL		0x03
-#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL	0x04
-#define I2O_PARAMS_STATUS_FIELD_UNREADABLE	0x05
-#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE	0x06
-#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS	0x07
-#define I2O_PARAMS_STATUS_INVALID_GROUP_ID	0x08
-#define I2O_PARAMS_STATUS_INVALID_OPERATION	0x09
-#define I2O_PARAMS_STATUS_NO_KEY_FIELD		0x0A
-#define I2O_PARAMS_STATUS_NO_SUCH_FIELD		0x0B
-#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP	0x0C
-#define I2O_PARAMS_STATUS_OPERATION_ERROR	0x0D
-#define I2O_PARAMS_STATUS_SCALAR_ERROR		0x0E
-#define I2O_PARAMS_STATUS_TABLE_ERROR		0x0F
-#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE	0x10
-
-/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
- * messages: Table 3-2 Detailed Status Codes.*/
-
-#define I2O_DSC_SUCCESS                        0x0000
-#define I2O_DSC_BAD_KEY                        0x0002
-#define I2O_DSC_TCL_ERROR                      0x0003
-#define I2O_DSC_REPLY_BUFFER_FULL              0x0004
-#define I2O_DSC_NO_SUCH_PAGE                   0x0005
-#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT     0x0006
-#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD     0x0007
-#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE         0x0009
-#define I2O_DSC_UNSUPPORTED_FUNCTION           0x000A
-#define I2O_DSC_DEVICE_LOCKED                  0x000B
-#define I2O_DSC_DEVICE_RESET                   0x000C
-#define I2O_DSC_INAPPROPRIATE_FUNCTION         0x000D
-#define I2O_DSC_INVALID_INITIATOR_ADDRESS      0x000E
-#define I2O_DSC_INVALID_MESSAGE_FLAGS          0x000F
-#define I2O_DSC_INVALID_OFFSET                 0x0010
-#define I2O_DSC_INVALID_PARAMETER              0x0011
-#define I2O_DSC_INVALID_REQUEST                0x0012
-#define I2O_DSC_INVALID_TARGET_ADDRESS         0x0013
-#define I2O_DSC_MESSAGE_TOO_LARGE              0x0014
-#define I2O_DSC_MESSAGE_TOO_SMALL              0x0015
-#define I2O_DSC_MISSING_PARAMETER              0x0016
-#define I2O_DSC_TIMEOUT                        0x0017
-#define I2O_DSC_UNKNOWN_ERROR                  0x0018
-#define I2O_DSC_UNKNOWN_FUNCTION               0x0019
-#define I2O_DSC_UNSUPPORTED_VERSION            0x001A
-#define I2O_DSC_DEVICE_BUSY                    0x001B
-#define I2O_DSC_DEVICE_NOT_AVAILABLE           0x001C
-
-/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
-   Status Codes.*/
-
-#define I2O_BSA_DSC_SUCCESS               0x0000
-#define I2O_BSA_DSC_MEDIA_ERROR           0x0001
-#define I2O_BSA_DSC_ACCESS_ERROR          0x0002
-#define I2O_BSA_DSC_DEVICE_FAILURE        0x0003
-#define I2O_BSA_DSC_DEVICE_NOT_READY      0x0004
-#define I2O_BSA_DSC_MEDIA_NOT_PRESENT     0x0005
-#define I2O_BSA_DSC_MEDIA_LOCKED          0x0006
-#define I2O_BSA_DSC_MEDIA_FAILURE         0x0007
-#define I2O_BSA_DSC_PROTOCOL_FAILURE      0x0008
-#define I2O_BSA_DSC_BUS_FAILURE           0x0009
-#define I2O_BSA_DSC_ACCESS_VIOLATION      0x000A
-#define I2O_BSA_DSC_WRITE_PROTECTED       0x000B
-#define I2O_BSA_DSC_DEVICE_RESET          0x000C
-#define I2O_BSA_DSC_VOLUME_CHANGED        0x000D
-#define I2O_BSA_DSC_TIMEOUT               0x000E
-
-/* FailureStatusCodes, Table 3-3 Message Failure Codes */
-
-#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED             0x81
-#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED            0x82
-#define I2O_FSC_TRANSPORT_CONGESTION                    0x83
-#define I2O_FSC_TRANSPORT_FAILURE                       0x84
-#define I2O_FSC_TRANSPORT_STATE_ERROR                   0x85
-#define I2O_FSC_TRANSPORT_TIME_OUT                      0x86
-#define I2O_FSC_TRANSPORT_ROUTING_FAILURE               0x87
-#define I2O_FSC_TRANSPORT_INVALID_VERSION               0x88
-#define I2O_FSC_TRANSPORT_INVALID_OFFSET                0x89
-#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS             0x8A
-#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL               0x8B
-#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE               0x8C
-#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID             0x8D
-#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID          0x8E
-#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT     0x8F
-#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE               0xFF
-
-/* Device Claim Types */
-#define	I2O_CLAIM_PRIMARY					0x01000000
-#define	I2O_CLAIM_MANAGEMENT					0x02000000
-#define	I2O_CLAIM_AUTHORIZED					0x03000000
-#define	I2O_CLAIM_SECONDARY					0x04000000
-
-/* Message header defines for VersionOffset */
-#define I2OVER15	0x0001
-#define I2OVER20	0x0002
-
-/* Default is 1.5 */
-#define I2OVERSION	I2OVER15
-
-#define SGL_OFFSET_0    I2OVERSION
-#define SGL_OFFSET_4    (0x0040 | I2OVERSION)
-#define SGL_OFFSET_5    (0x0050 | I2OVERSION)
-#define SGL_OFFSET_6    (0x0060 | I2OVERSION)
-#define SGL_OFFSET_7    (0x0070 | I2OVERSION)
-#define SGL_OFFSET_8    (0x0080 | I2OVERSION)
-#define SGL_OFFSET_9    (0x0090 | I2OVERSION)
-#define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
-#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
-#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
-#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
-
-/* Transaction Reply Lists (TRL) Control Word structure */
-#define TRL_SINGLE_FIXED_LENGTH		0x00
-#define TRL_SINGLE_VARIABLE_LENGTH	0x40
-#define TRL_MULTIPLE_FIXED_LENGTH	0x80
-
- /* msg header defines for MsgFlags */
-#define MSG_STATIC	0x0100
-#define MSG_64BIT_CNTXT	0x0200
-#define MSG_MULTI_TRANS	0x1000
-#define MSG_FAIL	0x2000
-#define MSG_FINAL	0x4000
-#define MSG_REPLY	0x8000
-
- /* minimum size msg */
-#define THREE_WORD_MSG_SIZE	0x00030000
-#define FOUR_WORD_MSG_SIZE	0x00040000
-#define FIVE_WORD_MSG_SIZE	0x00050000
-#define SIX_WORD_MSG_SIZE	0x00060000
-#define SEVEN_WORD_MSG_SIZE	0x00070000
-#define EIGHT_WORD_MSG_SIZE	0x00080000
-#define NINE_WORD_MSG_SIZE	0x00090000
-#define TEN_WORD_MSG_SIZE	0x000A0000
-#define ELEVEN_WORD_MSG_SIZE	0x000B0000
-#define I2O_MESSAGE_SIZE(x)	((x)<<16)
-
-/* special TID assignments */
-#define ADAPTER_TID		0
-#define HOST_TID		1
-
-/* outbound queue defines */
-#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
-#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
-
-/* inbound queue definitions */
-#define I2O_MSG_INPOOL_MIN		32
-#define I2O_INBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
-
-#define I2O_POST_WAIT_OK	0
-#define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
-
-#define I2O_CONTEXT_LIST_MIN_LENGTH	15
-#define I2O_CONTEXT_LIST_USED		0x01
-#define I2O_CONTEXT_LIST_DELETED	0x02
-
-/* timeouts */
-#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE	15
-#define I2O_TIMEOUT_MESSAGE_GET		5
-#define I2O_TIMEOUT_RESET		30
-#define I2O_TIMEOUT_STATUS_GET		5
-#define I2O_TIMEOUT_LCT_GET		360
-#define I2O_TIMEOUT_SCSI_SCB_ABORT	240
-
-/* retries */
-#define I2O_HRT_GET_TRIES		3
-#define I2O_LCT_GET_TRIES		3
-
-/* defines for max_sectors and max_phys_segments */
-#define I2O_MAX_SECTORS			1024
-#define I2O_MAX_SECTORS_LIMITED		128
-#define I2O_MAX_PHYS_SEGMENTS		BLK_MAX_SEGMENTS
-
-/*
- *	Message structures
- */
-struct i2o_message {
-	union {
-		struct {
-			u8 version_offset;
-			u8 flags;
-			u16 size;
-			u32 target_tid:12;
-			u32 init_tid:12;
-			u32 function:8;
-			u32 icntxt;	/* initiator context */
-			u32 tcntxt;	/* transaction context */
-		} s;
-		u32 head[4];
-	} u;
-	/* List follows */
-	u32 body[0];
-};
-
-/* MFA and I2O message used by mempool */
-struct i2o_msg_mfa {
-	u32 mfa;		/* MFA returned by the controller */
-	struct i2o_message msg;	/* I2O message */
-};
-
-/*
- *	Each I2O device entity has one of these. There is one per device.
- */
-struct i2o_device {
-	i2o_lct_entry lct_data;	/* Device LCT information */
-
-	struct i2o_controller *iop;	/* Controlling IOP */
-	struct list_head list;	/* node in IOP devices list */
-
-	struct device device;
-
-	struct mutex lock;	/* device lock */
-};
-
-/*
- *	Event structure provided to the event handling function
- */
-struct i2o_event {
-	struct work_struct work;
-	struct i2o_device *i2o_dev;	/* I2O device pointer from which the
-					   event reply was initiated */
-	u16 size;		/* Size of data in 32-bit words */
-	u32 tcntxt;		/* Transaction context used at
-				   registration */
-	u32 event_indicator;	/* Event indicator from reply */
-	u32 data[0];		/* Event data from reply */
-};
-
-/*
- *	I2O classes which could be handled by the OSM
- */
-struct i2o_class_id {
-	u16 class_id:12;
-};
-
-/*
- *	I2O driver structure for OSMs
- */
-struct i2o_driver {
-	char *name;		/* OSM name */
-	int context;		/* Low 8 bits of the transaction info */
-	struct i2o_class_id *classes;	/* I2O classes that this OSM handles */
-
-	/* Message reply handler */
-	int (*reply) (struct i2o_controller *, u32, struct i2o_message *);
-
-	/* Event handler */
-	work_func_t event;
-
-	struct workqueue_struct *event_queue;	/* Event queue */
-
-	struct device_driver driver;
-
-	/* notification of changes */
-	void (*notify_controller_add) (struct i2o_controller *);
-	void (*notify_controller_remove) (struct i2o_controller *);
-	void (*notify_device_add) (struct i2o_device *);
-	void (*notify_device_remove) (struct i2o_device *);
-
-	struct semaphore lock;
-};
-
-/*
- *	Contains DMA mapped address information
- */
-struct i2o_dma {
-	void *virt;
-	dma_addr_t phys;
-	size_t len;
-};
-
-/*
- *	Contains slab cache and mempool information
- */
-struct i2o_pool {
-	char *name;
-	struct kmem_cache *slab;
-	mempool_t *mempool;
-};
-
-/*
- *	Contains IO mapped address information
- */
-struct i2o_io {
-	void __iomem *virt;
-	unsigned long phys;
-	unsigned long len;
-};
-
-/*
- *	Context queue entry, used for 32-bit context on 64-bit systems
- */
-struct i2o_context_list_element {
-	struct list_head list;
-	u32 context;
-	void *ptr;
-	unsigned long timestamp;
-};
-
-/*
- * Each I2O controller has one of these objects
- */
-struct i2o_controller {
-	char name[16];
-	int unit;
-	int type;
-
-	struct pci_dev *pdev;	/* PCI device */
-
-	unsigned int promise:1;	/* Promise controller */
-	unsigned int adaptec:1;	/* DPT / Adaptec controller */
-	unsigned int raptor:1;	/* split bar */
-	unsigned int no_quiesce:1;	/* dont quiesce before reset */
-	unsigned int short_req:1;	/* use small block sizes */
-	unsigned int limit_sectors:1;	/* limit number of sectors / request */
-	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
-
-	struct list_head devices;	/* list of I2O devices */
-	struct list_head list;	/* Controller list */
-
-	void __iomem *in_port;	/* Inbout port address */
-	void __iomem *out_port;	/* Outbound port address */
-	void __iomem *irq_status;	/* Interrupt status register address */
-	void __iomem *irq_mask;	/* Interrupt mask register address */
-
-	struct i2o_dma status;	/* IOP status block */
-
-	struct i2o_dma hrt;	/* HW Resource Table */
-	i2o_lct *lct;		/* Logical Config Table */
-	struct i2o_dma dlct;	/* Temp LCT */
-	struct mutex lct_lock;	/* Lock for LCT updates */
-	struct i2o_dma status_block;	/* IOP status block */
-
-	struct i2o_io base;	/* controller messaging unit */
-	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
-	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
-
-	struct i2o_pool in_msg;	/* mempool for inbound messages */
-
-	unsigned int battery:1;	/* Has a battery backup */
-	unsigned int io_alloc:1;	/* An I/O resource was allocated */
-	unsigned int mem_alloc:1;	/* A memory resource was allocated */
-
-	struct resource io_resource;	/* I/O resource allocated to the IOP */
-	struct resource mem_resource;	/* Mem resource allocated to the IOP */
-
-	struct device device;
-	struct i2o_device *exec;	/* Executive */
-#if BITS_PER_LONG == 64
-	spinlock_t context_list_lock;	/* lock for context_list */
-	atomic_t context_list_counter;	/* needed for unique contexts */
-	struct list_head context_list;	/* list of context id's
-					   and pointers */
-#endif
-	spinlock_t lock;	/* lock for controller
-				   configuration */
-	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
-};
-
-/*
- * I2O System table entry
- *
- * The system table contains information about all the IOPs in the
- * system.  It is sent to all IOPs so that they can create peer2peer
- * connections between them.
- */
-struct i2o_sys_tbl_entry {
-	u16 org_id;
-	u16 reserved1;
-	u32 iop_id:12;
-	u32 reserved2:20;
-	u16 seg_num:12;
-	u16 i2o_version:4;
-	u8 iop_state;
-	u8 msg_type;
-	u16 frame_size;
-	u16 reserved3;
-	u32 last_changed;
-	u32 iop_capabilities;
-	u32 inbound_low;
-	u32 inbound_high;
-};
-
-struct i2o_sys_tbl {
-	u8 num_entries;
-	u8 version;
-	u16 reserved1;
-	u32 change_ind;
-	u32 reserved2;
-	u32 reserved3;
-	struct i2o_sys_tbl_entry iops[0];
-};
-
-extern struct list_head i2o_controllers;
-
-/* Message functions */
-extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int);
-extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *,
-				 unsigned long, struct i2o_dma *);
-
-/* IOP functions */
-extern int i2o_status_get(struct i2o_controller *);
-
-extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int,
-			      u32);
-extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16);
-extern struct i2o_controller *i2o_find_iop(int);
-
-/* Functions needed for handling 64-bit pointers in 32-bit context */
-#if BITS_PER_LONG == 64
-extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *);
-extern void *i2o_cntxt_list_get(struct i2o_controller *, u32);
-extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *);
-extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *);
-
-static inline u32 i2o_ptr_low(void *ptr)
-{
-	return (u32) (u64) ptr;
-};
-
-static inline u32 i2o_ptr_high(void *ptr)
-{
-	return (u32) ((u64) ptr >> 32);
-};
-
-static inline u32 i2o_dma_low(dma_addr_t dma_addr)
-{
-	return (u32) (u64) dma_addr;
-};
-
-static inline u32 i2o_dma_high(dma_addr_t dma_addr)
-{
-	return (u32) ((u64) dma_addr >> 32);
-};
-#else
-static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
-
-static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
-{
-	return (void *)context;
-};
-
-static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
-
-static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
-
-static inline u32 i2o_ptr_low(void *ptr)
-{
-	return (u32) ptr;
-};
-
-static inline u32 i2o_ptr_high(void *ptr)
-{
-	return 0;
-};
-
-static inline u32 i2o_dma_low(dma_addr_t dma_addr)
-{
-	return (u32) dma_addr;
-};
-
-static inline u32 i2o_dma_high(dma_addr_t dma_addr)
-{
-	return 0;
-};
-#endif
-
-extern u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size);
-extern dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
-					    size_t size,
-					    enum dma_data_direction direction,
-					    u32 ** sg_ptr);
-extern int i2o_dma_map_sg(struct i2o_controller *c,
-				 struct scatterlist *sg, int sg_count,
-				 enum dma_data_direction direction,
-				 u32 ** sg_ptr);
-extern int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len);
-extern void i2o_dma_free(struct device *dev, struct i2o_dma *addr);
-extern int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
-								size_t len);
-extern int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
-				 size_t size, int min_nr);
-extern void i2o_pool_free(struct i2o_pool *pool);
-/* I2O driver (OSM) functions */
-extern int i2o_driver_register(struct i2o_driver *);
-extern void i2o_driver_unregister(struct i2o_driver *);
-
-/**
- *	i2o_driver_notify_controller_add - Send notification of added controller
- *	@drv: I2O driver
- *	@c: I2O controller
- *
- *	Send notification of added controller to a single registered driver.
- */
-static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
-						    struct i2o_controller *c)
-{
-	if (drv->notify_controller_add)
-		drv->notify_controller_add(c);
-};
-
-/**
- *	i2o_driver_notify_controller_remove - Send notification of removed controller
- *	@drv: I2O driver
- *	@c: I2O controller
- *
- *	Send notification of removed controller to a single registered driver.
- */
-static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
-						       struct i2o_controller *c)
-{
-	if (drv->notify_controller_remove)
-		drv->notify_controller_remove(c);
-};
-
-/**
- *	i2o_driver_notify_device_add - Send notification of added device
- *	@drv: I2O driver
- *	@i2o_dev: the added i2o_device
- *
- *	Send notification of added device to a single registered driver.
- */
-static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
-						struct i2o_device *i2o_dev)
-{
-	if (drv->notify_device_add)
-		drv->notify_device_add(i2o_dev);
-};
-
-/**
- *	i2o_driver_notify_device_remove - Send notification of removed device
- *	@drv: I2O driver
- *	@i2o_dev: the added i2o_device
- *
- *	Send notification of removed device to a single registered driver.
- */
-static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
-						   struct i2o_device *i2o_dev)
-{
-	if (drv->notify_device_remove)
-		drv->notify_device_remove(i2o_dev);
-};
-
-extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
-extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
-extern void i2o_driver_notify_device_add_all(struct i2o_device *);
-extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
-
-/* I2O device functions */
-extern int i2o_device_claim(struct i2o_device *);
-extern int i2o_device_claim_release(struct i2o_device *);
-
-/* Exec OSM functions */
-extern int i2o_exec_lct_get(struct i2o_controller *);
-
-/* device / driver / kobject conversion functions */
-#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
-#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
-#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
-
-/**
- *	i2o_out_to_virt - Turn an I2O message to a virtual address
- *	@c: controller
- *	@m: message engine value
- *
- *	Turn a receive message from an I2O controller bus address into
- *	a Linux virtual address. The shared page frame is a linear block
- *	so we simply have to shift the offset. This function does not
- *	work for sender side messages as they are ioremap objects
- *	provided by the I2O controller.
- */
-static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
-						      u32 m)
-{
-	BUG_ON(m < c->out_queue.phys
-	       || m >= c->out_queue.phys + c->out_queue.len);
-
-	return c->out_queue.virt + (m - c->out_queue.phys);
-};
-
-/**
- *	i2o_msg_in_to_virt - Turn an I2O message to a virtual address
- *	@c: controller
- *	@m: message engine value
- *
- *	Turn a send message from an I2O controller bus address into
- *	a Linux virtual address. The shared page frame is a linear block
- *	so we simply have to shift the offset. This function does not
- *	work for receive side messages as they are kmalloc objects
- *	in a different pool.
- */
-static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
-							     i2o_controller *c,
-							     u32 m)
-{
-	return c->in_queue.virt + m;
-};
-
-/**
- *	i2o_msg_get - obtain an I2O message from the IOP
- *	@c: I2O controller
- *
- *	This function tries to get a message frame. If no message frame is
- *	available do not wait until one is available (see also i2o_msg_get_wait).
- *	The returned pointer to the message frame is not in I/O memory, it is
- *	allocated from a mempool. But because a MFA is allocated from the
- *	controller too it is guaranteed that i2o_msg_post() will never fail.
- *
- *	On a success a pointer to the message frame is returned. If the message
- *	queue is empty -EBUSY is returned and if no memory is available -ENOMEM
- *	is returned.
- */
-static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
-{
-	struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC);
-	if (!mmsg)
-		return ERR_PTR(-ENOMEM);
-
-	mmsg->mfa = readl(c->in_port);
-	if (unlikely(mmsg->mfa >= c->in_queue.len)) {
-		u32 mfa = mmsg->mfa;
-
-		mempool_free(mmsg, c->in_msg.mempool);
-
-		if (mfa == I2O_QUEUE_EMPTY)
-			return ERR_PTR(-EBUSY);
-		return ERR_PTR(-EFAULT);
-	}
-
-	return &mmsg->msg;
-};
-
-/**
- *	i2o_msg_post - Post I2O message to I2O controller
- *	@c: I2O controller to which the message should be send
- *	@msg: message returned by i2o_msg_get()
- *
- *	Post the message to the I2O controller and return immediately.
- */
-static inline void i2o_msg_post(struct i2o_controller *c,
-				struct i2o_message *msg)
-{
-	struct i2o_msg_mfa *mmsg;
-
-	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
-	memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg,
-		    (le32_to_cpu(msg->u.head[0]) >> 16) << 2);
-	writel(mmsg->mfa, c->in_port);
-	mempool_free(mmsg, c->in_msg.mempool);
-};
-
-/**
- * 	i2o_msg_post_wait - Post and wait a message and wait until return
- *	@c: controller
- *	@msg: message to post
- *	@timeout: time in seconds to wait
- *
- * 	This API allows an OSM to post a message and then be told whether or
- *	not the system received a successful reply. If the message times out
- *	then the value '-ETIMEDOUT' is returned.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static inline int i2o_msg_post_wait(struct i2o_controller *c,
-				    struct i2o_message *msg,
-				    unsigned long timeout)
-{
-	return i2o_msg_post_wait_mem(c, msg, timeout, NULL);
-};
-
-/**
- *	i2o_msg_nop_mfa - Returns a fetched MFA back to the controller
- *	@c: I2O controller from which the MFA was fetched
- *	@mfa: MFA which should be returned
- *
- *	This function must be used for preserved messages, because i2o_msg_nop()
- *	also returns the allocated memory back to the msg_pool mempool.
- */
-static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa)
-{
-	struct i2o_message __iomem *msg;
-	u32 nop[3] = {
-		THREE_WORD_MSG_SIZE | SGL_OFFSET_0,
-		I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
-		0x00000000
-	};
-
-	msg = i2o_msg_in_to_virt(c, mfa);
-	memcpy_toio(msg, nop, sizeof(nop));
-	writel(mfa, c->in_port);
-};
-
-/**
- *	i2o_msg_nop - Returns a message which is not used
- *	@c: I2O controller from which the message was created
- *	@msg: message which should be returned
- *
- *	If you fetch a message via i2o_msg_get, and can't use it, you must
- *	return the message with this function. Otherwise the MFA is lost as well
- *	as the allocated memory from the mempool.
- */
-static inline void i2o_msg_nop(struct i2o_controller *c,
-			       struct i2o_message *msg)
-{
-	struct i2o_msg_mfa *mmsg;
-	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
-
-	i2o_msg_nop_mfa(c, mmsg->mfa);
-	mempool_free(mmsg, c->in_msg.mempool);
-};
-
-/**
- *	i2o_flush_reply - Flush reply from I2O controller
- *	@c: I2O controller
- *	@m: the message identifier
- *
- *	The I2O controller must be informed that the reply message is not needed
- *	anymore. If you forget to flush the reply, the message frame can't be
- *	used by the controller anymore and is therefore lost.
- */
-static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
-{
-	writel(m, c->out_port);
-};
-
-/*
- *	Endian handling wrapped into the macro - keeps the core code
- *	cleaner.
- */
-
-#define i2o_raw_writel(val, mem)	__raw_writel(cpu_to_le32(val), mem)
-
-extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
-extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
-			      void *, int);
-
-/* debugging and troubleshooting/diagnostic helpers. */
-#define osm_printk(level, format, arg...)  \
-	printk(level "%s: " format, OSM_NAME , ## arg)
-
-#ifdef DEBUG
-#define osm_debug(format, arg...) \
-	osm_printk(KERN_DEBUG, format , ## arg)
-#else
-#define osm_debug(format, arg...) \
-        do { } while (0)
-#endif
-
-#define osm_err(format, arg...)		\
-	osm_printk(KERN_ERR, format , ## arg)
-#define osm_info(format, arg...)		\
-	osm_printk(KERN_INFO, format , ## arg)
-#define osm_warn(format, arg...)		\
-	osm_printk(KERN_WARNING, format , ## arg)
-
-/* debugging functions */
-extern void i2o_report_status(const char *, const char *, struct i2o_message *);
-extern void i2o_dump_message(struct i2o_message *);
-extern void i2o_dump_hrt(struct i2o_controller *c);
-extern void i2o_debug_state(struct i2o_controller *c);
-
-#endif				/* _I2O_H */
-- 
cgit v1.2.3


From 851c63e3b381fdbf5aca1a797f37d8606b5588d2 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Mon, 5 Jan 2015 15:43:39 +0000
Subject: of: Fix brace position for struct of_device_id definition

Currently it is not easy to grep for the definition of struct of_device_id.
This is trivially fixed by moving the brace to the right place.

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Rob Herring <robh+dt@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/mod_devicetable.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 745def862580..bbf85d612be5 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -220,8 +220,7 @@ struct serio_device_id {
 /*
  * Struct used for matching a device
  */
-struct of_device_id
-{
+struct of_device_id {
 	char	name[32];
 	char	type[32];
 	char	compatible[128];
-- 
cgit v1.2.3


From 1e0fb9ec679c9273a641f1d6f3d25ea47baef2bb Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 24 Oct 2014 15:58:10 -0700
Subject: perf: Add pmu callbacks to track event mapping and unmapping

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Vince Weaver <vince@deater.net>
Cc: "hillf.zj" <hillf.zj@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/266afcba1d1f91ea5501e4e16e94bbbc1a9339b6.1414190806.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 7 +++++++
 kernel/events/core.c       | 9 +++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5cad0e6f3552..33262004c310 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -202,6 +202,13 @@ struct pmu {
 	 */
 	int (*event_init)		(struct perf_event *event);
 
+	/*
+	 * Notification that the event was mapped or unmapped.  Called
+	 * in the context of the mapping task.
+	 */
+	void (*event_mapped)		(struct perf_event *event); /*optional*/
+	void (*event_unmapped)		(struct perf_event *event); /*optional*/
+
 #define PERF_EF_START	0x01		/* start the counter when adding    */
 #define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
 #define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f2fbb8b5069..cc1487145d33 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4293,6 +4293,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 
 	atomic_inc(&event->mmap_count);
 	atomic_inc(&event->rb->mmap_count);
+
+	if (event->pmu->event_mapped)
+		event->pmu->event_mapped(event);
 }
 
 /*
@@ -4312,6 +4315,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	int mmap_locked = rb->mmap_locked;
 	unsigned long size = perf_data_size(rb);
 
+	if (event->pmu->event_unmapped)
+		event->pmu->event_unmapped(event);
+
 	atomic_dec(&rb->mmap_count);
 
 	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4513,6 +4519,9 @@ unlock:
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
+	if (event->pmu->event_mapped)
+		event->pmu->event_mapped(event);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2c561246524c3319473bf47b558354f7ff47f0cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 3 Feb 2015 12:55:31 +0100
Subject: block: Simplify bsg complete all

It took me a few tries to figure out what this code did; lets rewrite
it into a more regular form.

The thing that makes this one 'special' is the BSG_F_BLOCK flag, if
that is not set we're not supposed/allowed to block and should spin
wait for completion.

The (new) io_wait_event() will never see a false condition in case of
the spinning and we will therefore not block.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bsg.c          | 72 ++++++++++++++++++----------------------------------
 include/linux/wait.h | 15 +++++++++++
 2 files changed, 40 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 276e869e686c..d214e929ce18 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
 	return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
 }
 
-static int bsg_io_schedule(struct bsg_device *bd)
-{
-	DEFINE_WAIT(wait);
-	int ret = 0;
-
-	spin_lock_irq(&bd->lock);
-
-	BUG_ON(bd->done_cmds > bd->queued_cmds);
-
-	/*
-	 * -ENOSPC or -ENODATA?  I'm going for -ENODATA, meaning "I have no
-	 * work to do", even though we return -ENOSPC after this same test
-	 * during bsg_write() -- there, it means our buffer can't have more
-	 * bsg_commands added to it, thus has no space left.
-	 */
-	if (bd->done_cmds == bd->queued_cmds) {
-		ret = -ENODATA;
-		goto unlock;
-	}
-
-	if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
-		ret = -EAGAIN;
-		goto unlock;
-	}
-
-	prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
-	spin_unlock_irq(&bd->lock);
-	io_schedule();
-	finish_wait(&bd->wq_done, &wait);
-
-	return ret;
-unlock:
-	spin_unlock_irq(&bd->lock);
-	return ret;
-}
-
 static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 				struct sg_io_v4 *hdr, struct bsg_device *bd,
 				fmode_t has_write_perm)
@@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	return ret;
 }
 
+static bool bsg_complete(struct bsg_device *bd)
+{
+	bool ret = false;
+	bool spin;
+
+	do {
+		spin_lock_irq(&bd->lock);
+
+		BUG_ON(bd->done_cmds > bd->queued_cmds);
+
+		/*
+		 * All commands consumed.
+		 */
+		if (bd->done_cmds == bd->queued_cmds)
+			ret = true;
+
+		spin = !test_bit(BSG_F_BLOCK, &bd->flags);
+
+		spin_unlock_irq(&bd->lock);
+	} while (!ret && spin);
+
+	return ret;
+}
+
 static int bsg_complete_all_commands(struct bsg_device *bd)
 {
 	struct bsg_command *bc;
@@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
 	/*
 	 * wait for all commands to complete
 	 */
-	ret = 0;
-	do {
-		ret = bsg_io_schedule(bd);
-		/*
-		 * look for -ENODATA specifically -- we'll sometimes get
-		 * -ERESTARTSYS when we've taken a signal, but we can't
-		 * return until we're done freeing the queue, so ignore
-		 * it.  The signal will get handled when we're done freeing
-		 * the bsg_device.
-		 */
-	} while (ret != -ENODATA);
+	io_wait_event(bd->wq_done, bsg_complete(bd));
 
 	/*
 	 * discard done commands
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2232ed16635a..71fc1d31e48d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -267,6 +267,21 @@ do {									\
 	__wait_event(wq, condition);					\
 } while (0)
 
+#define __io_wait_event(wq, condition)					\
+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			    io_schedule())
+
+/*
+ * io_wait_event() -- like wait_event() but with io_schedule()
+ */
+#define io_wait_event(wq, condition)					\
+do {									\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	__io_wait_event(wq, condition);					\
+} while (0)
+
 #define __wait_event_freezable(wq, condition)				\
 	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\
 			    schedule(); try_to_freeze())
-- 
cgit v1.2.3


From c4546f246636ccf4cda092bcfcafcb5f5f752ec7 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Mon, 26 Jan 2015 09:22:22 -0700
Subject: coresight: remove the unnecessary function coresight_is_bit_set()

This function coresight_is_bit_set() isn't called, so we should
remove it.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/coresight.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index cd6d4c384ca3..3486b9082adb 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -227,7 +227,6 @@ coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
 extern int coresight_enable(struct coresight_device *csdev);
 extern void coresight_disable(struct coresight_device *csdev);
-extern int coresight_is_bit_set(u32 val, int position, int value);
 extern int coresight_timeout(void __iomem *addr, u32 offset,
 			     int position, int value);
 #else
@@ -237,8 +236,6 @@ static inline void coresight_unregister(struct coresight_device *csdev) {}
 static inline int
 coresight_enable(struct coresight_device *csdev) { return -ENOSYS; }
 static inline void coresight_disable(struct coresight_device *csdev) {}
-static inline int coresight_is_bit_set(u32 val, int position, int value)
-					 { return 0; }
 static inline int coresight_timeout(void __iomem *addr, u32 offset,
 				     int position, int value) { return 1; }
 #endif
-- 
cgit v1.2.3


From 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 2 Feb 2015 00:37:00 -0500
Subject: vfs: add support for a lazytime mount option

Add a new mount option which enables a new "lazytime" mode.  This mode
causes atime, mtime, and ctime updates to only be made to the
in-memory version of the inode.  The on-disk times will only get
updated when (a) if the inode needs to be updated for some non-time
related change, (b) if userspace calls fsync(), syncfs() or sync(), or
(c) just before an undeleted inode is evicted from memory.

This is OK according to POSIX because there are no guarantees after a
crash unless userspace explicitly requests via a fsync(2) call.

For workloads which feature a large number of random write to a
preallocated file, the lazytime mount option significantly reduces
writes to the inode table.  The repeated 4k writes to a single block
will result in undesirable stress on flash devices and SMR disk
drives.  Even on conventional HDD's, the repeated writes to the inode
table block will trigger Adjacent Track Interference (ATI) remediation
latencies, which very negatively impact long tail latencies --- which
is a very big deal for web serving tiers (for example).

Google-Bug-Id: 18297052

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/inode.c                  |  6 ++++
 fs/fs-writeback.c                | 62 +++++++++++++++++++++++++++++++++-------
 fs/gfs2/file.c                   |  4 +--
 fs/inode.c                       | 56 +++++++++++++++++++++++++-----------
 fs/jfs/file.c                    |  2 +-
 fs/libfs.c                       |  2 +-
 fs/proc_namespace.c              |  1 +
 fs/sync.c                        |  8 ++++++
 include/linux/backing-dev.h      |  1 +
 include/linux/fs.h               |  5 ++++
 include/trace/events/writeback.h | 60 +++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/fs.h          |  4 ++-
 mm/backing-dev.c                 | 10 +++++--
 13 files changed, 186 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..628df5ba44a6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * If the inode is marked synchronous, we don't honour that here - doing
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
+ *
+ * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
+ * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
+ * to copy into the on-disk inode structure are the timestamp files.
  */
 void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
+	if (flags == I_DIRTY_TIME)
+		return;
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..004686191354 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	return ret;
 }
 
+#define EXPIRE_DIRTY_ATIME 0x0001
+
 /*
  * Move expired (dirtied before work->older_than_this) dirty inodes from
  * @delaying_queue to @dispatch_queue.
  */
 static int move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
+			       int flags,
 			       struct wb_writeback_work *work)
 {
+	unsigned long *older_than_this = NULL;
+	unsigned long expire_time;
 	LIST_HEAD(tmp);
 	struct list_head *pos, *node;
 	struct super_block *sb = NULL;
@@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 	int moved = 0;
 
+	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+		older_than_this = work->older_than_this;
+	else if ((work->reason == WB_REASON_SYNC) == 0) {
+		expire_time = jiffies - (HZ * 86400);
+		older_than_this = &expire_time;
+	}
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
-		if (work->older_than_this &&
-		    inode_dirtied_after(inode, *work->older_than_this))
+		if (older_than_this &&
+		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		list_move(&inode->i_wb_list, &tmp);
 		moved++;
+		if (flags & EXPIRE_DIRTY_ATIME)
+			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
 		if (sb_is_blkdev_sb(inode->i_sb))
 			continue;
 		if (sb && sb != inode->i_sb)
@@ -309,9 +322,12 @@ out:
 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 {
 	int moved;
+
 	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
-	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+				     EXPIRE_DIRTY_ATIME, work);
 	trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		 * updates after data IO completion.
 		 */
 		redirty_tail(inode, wb);
+	} else if (inode->i_state & I_DIRTY_TIME) {
+		list_move(&inode->i_wb_list, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
 		list_del_init(&inode->i_wb_list);
@@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode->i_lock);
 
 	dirty = inode->i_state & I_DIRTY;
-	inode->i_state &= ~I_DIRTY;
+	if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
+	     (inode->i_state & I_DIRTY_TIME)) ||
+	    (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
+		dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+		trace_writeback_lazytime(inode);
+	}
+	inode->i_state &= ~dirty;
 
 	/*
 	 * Paired with smp_mb() in __mark_inode_dirty().  This allows
@@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	spin_unlock(&inode->i_lock);
 
+	if (dirty & I_DIRTY_TIME)
+		mark_inode_dirty_sync(inode);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
-	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+	if (dirty & ~I_DIRTY_PAGES) {
 		int err = write_inode(inode, wbc);
 		if (ret == 0)
 			ret = err;
@@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * make sure inode is on some writeback list and leave it there unless
 	 * we have completely cleaned the inode.
 	 */
-	if (!(inode->i_state & I_DIRTY) &&
+	if (!(inode->i_state & I_DIRTY_ALL) &&
 	    (wbc->sync_mode != WB_SYNC_ALL ||
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
@@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * If inode is clean, remove it from writeback lists. Otherwise don't
 	 * touch it. See comment above for explanation.
 	 */
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY_ALL))
 		list_del_init(&inode->i_wb_list);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
@@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_DIRTY))
+		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
 		requeue_inode(inode, wb, &wbc);
 		inode_sync_complete(inode);
@@ -1145,16 +1171,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * page->mapping->host, so the page-dirtying time is recorded in the internal
  * blockdev inode.
  */
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
+	int dirtytime;
+
+	trace_writeback_mark_inode_dirty(inode, flags);
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 	 * dirty the inode itself
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
 		trace_writeback_dirty_inode_start(inode, flags);
 
 		if (sb->s_op->dirty_inode)
@@ -1162,6 +1192,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 		trace_writeback_dirty_inode(inode, flags);
 	}
+	if (flags & I_DIRTY_INODE)
+		flags &= ~I_DIRTY_TIME;
+	dirtytime = flags & I_DIRTY_TIME;
 
 	/*
 	 * Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1202,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	 */
 	smp_mb();
 
-	if ((inode->i_state & flags) == flags)
+	if (((inode->i_state & flags) == flags) ||
+	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
 		return;
 
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode->i_lock);
+	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
+		goto out_unlock_inode;
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
+		if (flags & I_DIRTY_INODE)
+			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;
 
 		/*
@@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			list_move(&inode->i_wb_list, dirtytime ?
+				  &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
 			spin_unlock(&bdi->wb.list_lock);
+			trace_writeback_dirty_inode_enqueue(inode);
 
 			if (wakeup_bdi)
 				bdi_wakeup_thread_delayed(bdi);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..15c44cf457cc 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	int sync_state = inode->i_state & I_DIRTY;
+	int sync_state = inode->i_state & I_DIRTY_ALL;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret = 0, ret1 = 0;
 
@@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	if (!gfs2_is_jdata(ip))
 		sync_state &= ~I_DIRTY_PAGES;
 	if (datasync)
-		sync_state &= ~I_DIRTY_SYNC;
+		sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
 
 	if (sync_state) {
 		ret = sync_inode_metadata(inode, 1);
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..4feb85cc125f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
 #include <linux/buffer_head.h> /* for inode_has_buffers */
 #include <linux/ratelimit.h>
 #include <linux/list_lru.h>
+#include <trace/events/writeback.h>
 #include "internal.h"
 
 /*
@@ -30,7 +31,7 @@
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
- *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
  * inode_hash_lock protects:
  *   inode_hashtable, inode->i_hash
  *
@@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode)
  */
 void inode_add_lru(struct inode *inode)
 {
-	if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
+				I_FREEING | I_WILL_FREE)) &&
 	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
 		inode_lru_list_add(inode);
 }
@@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
-		if (inode->i_state & I_DIRTY && !kill_dirty) {
+		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
 			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
@@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode)
  */
 void iput(struct inode *inode)
 {
-	if (inode) {
-		BUG_ON(inode->i_state & I_CLEAR);
-
-		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
-			iput_final(inode);
+	if (!inode)
+		return;
+	BUG_ON(inode->i_state & I_CLEAR);
+retry:
+	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+			atomic_inc(&inode->i_count);
+			inode->i_state &= ~I_DIRTY_TIME;
+			spin_unlock(&inode->i_lock);
+			trace_writeback_lazytime_iput(inode);
+			mark_inode_dirty_sync(inode);
+			goto retry;
+		}
+		iput_final(inode);
 	}
 }
 EXPORT_SYMBOL(iput);
@@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
 	return 0;
 }
 
-/*
- * This does the actual work of updating an inodes time or version.  Must have
- * had called mnt_want_write() before calling this.
- */
-static int update_time(struct inode *inode, struct timespec *time, int flags)
+int generic_update_time(struct inode *inode, struct timespec *time, int flags)
 {
-	if (inode->i_op->update_time)
-		return inode->i_op->update_time(inode, time, flags);
+	int iflags = I_DIRTY_TIME;
 
 	if (flags & S_ATIME)
 		inode->i_atime = *time;
@@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
 		inode->i_ctime = *time;
 	if (flags & S_MTIME)
 		inode->i_mtime = *time;
-	mark_inode_dirty_sync(inode);
+
+	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+		iflags |= I_DIRTY_SYNC;
+	__mark_inode_dirty(inode, iflags);
 	return 0;
 }
+EXPORT_SYMBOL(generic_update_time);
+
+/*
+ * This does the actual work of updating an inodes time or version.  Must have
+ * had called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	int (*update_time)(struct inode *, struct timespec *, int);
+
+	update_time = inode->i_op->update_time ? inode->i_op->update_time :
+		generic_update_time;
+
+	return update_time(inode, time, flags);
+}
 
 /**
  *	touch_atime	-	update the access time
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		return rc;
 
 	mutex_lock(&inode->i_mutex);
-	if (!(inode->i_state & I_DIRTY) ||
+	if (!(inode->i_state & I_DIRTY_ALL) ||
 	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
 
 	mutex_lock(&inode->i_mutex);
 	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY_ALL))
 		goto out;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 		{ MS_SYNCHRONOUS, ",sync" },
 		{ MS_DIRSYNC, ",dirsync" },
 		{ MS_MANDLOCK, ",mand" },
+		{ MS_LAZYTIME, ",lazytime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct inode *inode = file->f_mapping->host;
+
 	if (!file->f_op->fsync)
 		return -EINVAL;
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
 	return file->f_op->fsync(file, start, end, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012b7a14..4cdf7336f64a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -55,6 +55,7 @@ struct bdi_writeback {
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_dirty_time;	/* time stamps are dirty */
 	spinlock_t list_lock;		/* protects the b_* lists */
 };
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42efe13077b6..cd027ce2c705 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1746,8 +1746,12 @@ struct super_operations {
 #define __I_DIO_WAKEUP		9
 #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
+#define I_DIRTY_TIME		(1 << 11)
+#define __I_DIRTY_TIME_EXPIRED	12
+#define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
 
 extern void __mark_inode_dirty(struct inode *, int);
 static inline void mark_inode_dirty(struct inode *inode)
@@ -1910,6 +1914,7 @@ extern int current_umask(void);
 
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
+extern int generic_update_time(struct inode *, struct timespec *, int);
 
 static inline struct inode *file_inode(const struct file *f)
 {
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d65ab3f..5ecb4c234625 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -18,6 +18,8 @@
 		{I_FREEING,		"I_FREEING"},		\
 		{I_CLEAR,		"I_CLEAR"},		\
 		{I_SYNC,		"I_SYNC"},		\
+		{I_DIRTY_TIME,		"I_DIRTY_TIME"},	\
+		{I_DIRTY_TIME_EXPIRED,	"I_DIRTY_TIME_EXPIRED"}, \
 		{I_REFERENCED,		"I_REFERENCED"}		\
 	)
 
@@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 	TP_STRUCT__entry (
 		__array(char, name, 32)
 		__field(unsigned long, ino)
+		__field(unsigned long, state)
 		__field(unsigned long, flags)
 	),
 
@@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 		strncpy(__entry->name,
 			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
 		__entry->ino		= inode->i_ino;
+		__entry->state		= inode->i_state;
 		__entry->flags		= flags;
 	),
 
-	TP_printk("bdi %s: ino=%lu flags=%s",
+	TP_printk("bdi %s: ino=%lu state=%s flags=%s",
 		__entry->name,
 		__entry->ino,
+		show_inode_state(__entry->state),
 		show_inode_state(__entry->flags)
 	)
 );
 
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
 DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
 
 	TP_PROTO(struct inode *inode, int flags),
@@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
 	TP_ARGS(inode, wbc, nr_to_write)
 );
 
+DECLARE_EVENT_CLASS(writeback_lazytime_template,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(unsigned long,	ino			)
+		__field(unsigned long,	state			)
+		__field(	__u16, mode			)
+		__field(unsigned long, dirtied_when		)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->state	= inode->i_state;
+		__entry->mode	= inode->i_mode;
+		__entry->dirtied_when = inode->dirtied_when;
+	),
+
+	TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->dirtied_when,
+		  show_inode_state(__entry->state), __entry->mode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue,
+
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0a6784..9b964a5920af 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -90,6 +90,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
 #define MS_NOSEC	(1<<28)
@@ -100,7 +101,8 @@ struct inodes_stat_t {
 /*
  * Superblock flags that can be altered by MS_REMOUNT
  */
-#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
+#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
+			 MS_LAZYTIME)
 
 /*
  * Old magic mount flag and mask
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..915feea94c66 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	unsigned long nr_dirty, nr_io, nr_more_io;
+	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
 	struct inode *inode;
 
-	nr_dirty = nr_io = nr_more_io = 0;
+	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
 	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
@@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
+	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+		if (inode->i_state & I_DIRTY_TIME)
+			nr_dirty_time++;
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "b_dirty:            %10lu\n"
 		   "b_io:               %10lu\n"
 		   "b_more_io:          %10lu\n"
+		   "b_dirty_time:       %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
+		   nr_dirty_time,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
+	INIT_LIST_HEAD(&wb->b_dirty_time);
 	spin_lock_init(&wb->list_lock);
 	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
-- 
cgit v1.2.3


From fe032c422c5ba562ba9c2d316f55e258e03259c6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 2 Feb 2015 00:37:01 -0500
Subject: vfs: add find_inode_nowait() function

Add a new function find_inode_nowait() which is an even more general
version of ilookup5_nowait().  It is designed for callers which need
very fine grained control over when the function is allowed to block
or increment the inode's reference count.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  5 +++++
 2 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 4feb85cc125f..740cba79c2b9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1284,6 +1284,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 }
 EXPORT_SYMBOL(ilookup);
 
+/**
+ * find_inode_nowait - find an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @match:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @match
+ *
+ * Search for the inode specified by @hashval and @data in the inode
+ * cache, where the helper function @match will return 0 if the inode
+ * does not match, 1 if the inode does match, and -1 if the search
+ * should be stopped.  The @match function must be responsible for
+ * taking the i_lock spin_lock and checking i_state for an inode being
+ * freed or being initialized, and incrementing the reference count
+ * before returning 1.  It also must not sleep, since it is called with
+ * the inode_hash_lock spinlock held.
+ *
+ * This is a even more generalized version of ilookup5() when the
+ * function must never block --- find_inode() can block in
+ * __wait_on_freeing_inode() --- or when the caller can not increment
+ * the reference count because the resulting iput() might cause an
+ * inode eviction.  The tradeoff is that the @match funtion must be
+ * very carefully implemented.
+ */
+struct inode *find_inode_nowait(struct super_block *sb,
+				unsigned long hashval,
+				int (*match)(struct inode *, unsigned long,
+					     void *),
+				void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode, *ret_inode = NULL;
+	int mval;
+
+	spin_lock(&inode_hash_lock);
+	hlist_for_each_entry(inode, head, i_hash) {
+		if (inode->i_sb != sb)
+			continue;
+		mval = match(inode, hashval, data);
+		if (mval == 0)
+			continue;
+		if (mval == 1)
+			ret_inode = inode;
+		goto out;
+	}
+out:
+	spin_unlock(&inode_hash_lock);
+	return ret_inode;
+}
+EXPORT_SYMBOL(find_inode_nowait);
+
 int insert_inode_locked(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cd027ce2c705..5ea8b6e46a3d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2441,6 +2441,11 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
 
 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
 extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern struct inode *find_inode_nowait(struct super_block *,
+				       unsigned long,
+				       int (*match)(struct inode *,
+						    unsigned long, void *),
+				       void *data);
 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
 extern int insert_inode_locked(struct inode *);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-- 
cgit v1.2.3


From 1c2b364b225a5a93dbd1f317bd000d2fec2694be Mon Sep 17 00:00:00 2001
From: Tiejun Chen <tiejun.chen@intel.com>
Date: Thu, 5 Feb 2015 17:22:26 +0800
Subject: kvm: remove KVM_MMIO_SIZE

After f78146b0f923, "KVM: Fix page-crossing MMIO", and
87da7e66a405, "KVM: x86: fix vcpu->mmio_fragments overflow",
actually KVM_MMIO_SIZE is gone.

Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 include/linux/kvm_host.h        | 4 ----
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9dbc7435cbc2..848947ac6ade 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,8 +38,6 @@
 #define KVM_PRIVATE_MEM_SLOTS 3
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
-#define KVM_MMIO_SIZE 16
-
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 32d057571bf6..8a82838034f1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -33,10 +33,6 @@
 
 #include <asm/kvm_host.h>
 
-#ifndef KVM_MMIO_SIZE
-#define KVM_MMIO_SIZE 8
-#endif
-
 /*
  * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
  * in kvm, other bits are visible for userspace which are defined in
-- 
cgit v1.2.3


From 7fbc1067f06098c6b674e672fbb17e758fcc9402 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Oct 2013 10:32:35 +0100
Subject: exportfs: add methods for block layout exports

Add three methods to allow exporting pnfs block layout volumes:

 - get_uuid: get a filesystem unique signature exposed to clients
 - map_blocks: map and if nessecary allocate blocks for a layout
 - commit_blocks: commit blocks in a layout once the client is done with them

For now we stick the external pnfs block layout interfaces into s_export_op to
avoid mixing them up with the internal interface between the NFS server and
the layout drivers.  Once we've fully internalized the latter interface we
can redecide if these methods should stay in s_export_ops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/exportfs.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 41b223a59a63..fa05e04c5531 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 struct dentry;
+struct iattr;
 struct inode;
 struct super_block;
 struct vfsmount;
@@ -180,6 +181,21 @@ struct fid {
  *    get_name is not (which is possibly inconsistent)
  */
 
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
+#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
+
+#define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
+
+struct iomap {
+	sector_t	blkno;	/* first sector of mapping */
+	loff_t		offset;	/* file offset of mapping, bytes */
+	u64		length;	/* length of mapping, bytes */
+	int		type;	/* type of mapping */
+};
+
 struct export_operations {
 	int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
 			struct inode *parent);
@@ -191,6 +207,13 @@ struct export_operations {
 			struct dentry *child);
 	struct dentry * (*get_parent)(struct dentry *child);
 	int (*commit_metadata)(struct inode *inode);
+
+	int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+	int (*map_blocks)(struct inode *inode, loff_t offset,
+			  u64 len, struct iomap *iomap,
+			  bool write, u32 *device_generation);
+	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
+			     int nr_iomaps, struct iattr *iattr);
 };
 
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
-- 
cgit v1.2.3


From ddad8dd0a162fde61646a627a3017c258601dc8a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 18 Jan 2015 16:16:29 +0100
Subject: block: use blk_rq_map_user_iov to implement blk_rq_map_user

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         |  53 +-------------------
 block/blk-map.c     | 137 +++++-----------------------------------------------
 include/linux/bio.h |   4 --
 3 files changed, 14 insertions(+), 180 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 54da51ed43de..879921e6b049 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1102,7 +1102,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c
  *	bio_uncopy_user	-	finish previously mapped bio
  *	@bio: bio being terminated
  *
- *	Free pages allocated from bio_copy_user() and write back data
+ *	Free pages allocated from bio_copy_user_iov() and write back data
  *	to user space in case of a read.
  */
 int bio_uncopy_user(struct bio *bio)
@@ -1256,32 +1256,6 @@ out_bmd:
 	return ERR_PTR(ret);
 }
 
-/**
- *	bio_copy_user	-	copy user data to bio
- *	@q: destination block queue
- *	@map_data: pointer to the rq_map_data holding pages (if necessary)
- *	@uaddr: start of user address
- *	@len: length in bytes
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
- *
- *	Prepares and returns a bio for indirect user io, bouncing data
- *	to/from kernel pages as necessary. Must be paired with
- *	call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
-			  unsigned long uaddr, unsigned int len,
-			  int write_to_vm, gfp_t gfp_mask)
-{
-	struct sg_iovec iov;
-
-	iov.iov_base = (void __user *)uaddr;
-	iov.iov_len = len;
-
-	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_copy_user);
-
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      const struct sg_iovec *iov, int iov_count,
@@ -1394,31 +1368,6 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	return ERR_PTR(ret);
 }
 
-/**
- *	bio_map_user	-	map user address into bio
- *	@q: the struct request_queue for the bio
- *	@bdev: destination block device
- *	@uaddr: start of user address
- *	@len: length in bytes
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
- *
- *	Map the user space address into a bio suitable for io to a block
- *	device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm,
-			 gfp_t gfp_mask)
-{
-	struct sg_iovec iov;
-
-	iov.iov_base = (void __user *)uaddr;
-	iov.iov_len = len;
-
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_map_user);
-
 /**
  *	bio_map_user_iov - map user sg_iovec table into bio
  *	@q: the struct request_queue for the bio
diff --git a/block/blk-map.c b/block/blk-map.c
index f890d4345b0c..152a5fe5d85e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -39,130 +39,6 @@ static int __blk_rq_unmap_user(struct bio *bio)
 	return ret;
 }
 
-static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
-			     struct rq_map_data *map_data, void __user *ubuf,
-			     unsigned int len, gfp_t gfp_mask)
-{
-	unsigned long uaddr;
-	struct bio *bio, *orig_bio;
-	int reading, ret;
-
-	reading = rq_data_dir(rq) == READ;
-
-	/*
-	 * if alignment requirement is satisfied, map in user pages for
-	 * direct dma. else, set up kernel bounce buffers
-	 */
-	uaddr = (unsigned long) ubuf;
-	if (blk_rq_aligned(q, uaddr, len) && !map_data)
-		bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
-	else
-		bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
-
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	if (map_data && map_data->null_mapped)
-		bio->bi_flags |= (1 << BIO_NULL_MAPPED);
-
-	orig_bio = bio;
-	blk_queue_bounce(q, &bio);
-
-	/*
-	 * We link the bounce buffer in and could have to traverse it
-	 * later so we have to get a ref to prevent it from being freed
-	 */
-	bio_get(bio);
-
-	ret = blk_rq_append_bio(q, rq, bio);
-	if (!ret)
-		return bio->bi_iter.bi_size;
-
-	/* if it was boucned we must call the end io function */
-	bio_endio(bio, 0);
-	__blk_rq_unmap_user(orig_bio);
-	bio_put(bio);
-	return ret;
-}
-
-/**
- * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
- * @q:		request queue where request should be inserted
- * @rq:		request structure to fill
- * @map_data:   pointer to the rq_map_data holding pages (if necessary)
- * @ubuf:	the user buffer
- * @len:	length of user data
- * @gfp_mask:	memory allocation flags
- *
- * Description:
- *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
- *    a kernel bounce buffer is used.
- *
- *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
- *    still in process context.
- *
- *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
- *    before being submitted to the device, as pages mapped may be out of
- *    reach. It's the callers responsibility to make sure this happens. The
- *    original bio must be passed back in to blk_rq_unmap_user() for proper
- *    unmapping.
- */
-int blk_rq_map_user(struct request_queue *q, struct request *rq,
-		    struct rq_map_data *map_data, void __user *ubuf,
-		    unsigned long len, gfp_t gfp_mask)
-{
-	unsigned long bytes_read = 0;
-	struct bio *bio = NULL;
-	int ret;
-
-	if (len > (queue_max_hw_sectors(q) << 9))
-		return -EINVAL;
-	if (!len)
-		return -EINVAL;
-
-	if (!ubuf && (!map_data || !map_data->null_mapped))
-		return -EINVAL;
-
-	while (bytes_read != len) {
-		unsigned long map_len, end, start;
-
-		map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
-		end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
-								>> PAGE_SHIFT;
-		start = (unsigned long)ubuf >> PAGE_SHIFT;
-
-		/*
-		 * A bad offset could cause us to require BIO_MAX_PAGES + 1
-		 * pages. If this happens we just lower the requested
-		 * mapping len by a page so that we can fit
-		 */
-		if (end - start > BIO_MAX_PAGES)
-			map_len -= PAGE_SIZE;
-
-		ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
-					gfp_mask);
-		if (ret < 0)
-			goto unmap_rq;
-		if (!bio)
-			bio = rq->bio;
-		bytes_read += ret;
-		ubuf += ret;
-
-		if (map_data)
-			map_data->offset += ret;
-	}
-
-	if (!bio_flagged(bio, BIO_USER_MAPPED))
-		rq->cmd_flags |= REQ_COPY_USER;
-
-	return 0;
-unmap_rq:
-	blk_rq_unmap_user(bio);
-	rq->bio = NULL;
-	return ret;
-}
-EXPORT_SYMBOL(blk_rq_map_user);
-
 /**
  * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
@@ -241,6 +117,19 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
 
+int blk_rq_map_user(struct request_queue *q, struct request *rq,
+		    struct rq_map_data *map_data, void __user *ubuf,
+		    unsigned long len, gfp_t gfp_mask)
+{
+	struct sg_iovec iov;
+
+	iov.iov_base = (void __user *)ubuf;
+	iov.iov_len = len;
+
+	return blk_rq_map_user_iov(q, rq, map_data, &iov, 1, len, gfp_mask);
+}
+EXPORT_SYMBOL(blk_rq_map_user);
+
 /**
  * blk_rq_unmap_user - unmap a request with user data
  * @bio:	       start of bio list
diff --git a/include/linux/bio.h b/include/linux/bio.h
index efead0b532c4..d0d6735d61da 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -428,8 +428,6 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 extern int bio_get_nr_vecs(struct block_device *);
-extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
-				unsigned long, unsigned int, int, gfp_t);
 struct sg_iovec;
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
@@ -462,8 +460,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 
-extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
-				 unsigned long, unsigned int, int, gfp_t);
 extern struct bio *bio_copy_user_iov(struct request_queue *,
 				     struct rq_map_data *,
 				     const struct sg_iovec *,
-- 
cgit v1.2.3


From 26e49cfc7e988a76bf1e55cef0d9e438e5489180 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Sun, 18 Jan 2015 16:16:31 +0100
Subject: block: pass iov_iter to the BLOCK_PC mapping functions

Make use of a new interface provided by iov_iter, backed by
scatter-gather list of iovec, instead of the old interface based on
sg_iovec. Also use iov_iter_advance() instead of manual iteration.

This commit should contain only literal replacements, without
functional changes.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Doug Gilbert <dgilbert@interlog.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park <dongsu.park@profitbricks.com>
[hch: fixed to do a deep clone of the iov_iter, and to properly use
      the iov_iter direction]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c            | 146 +++++++++++++++++++++++--------------------------
 block/blk-map.c        |  38 ++++++-------
 block/scsi_ioctl.c     |  17 ++----
 drivers/scsi/sg.c      |  15 ++---
 include/linux/bio.h    |   7 +--
 include/linux/blkdev.h |   4 +-
 6 files changed, 101 insertions(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 0895f694f440..7d8c6555e3f3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -28,7 +28,6 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
-#include <scsi/sg.h>		/* for struct sg_iovec */
 
 #include <trace/events/block.h>
 
@@ -1022,21 +1021,11 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 EXPORT_SYMBOL(bio_copy_data);
 
 struct bio_map_data {
-	int nr_sgvecs;
 	int is_our_pages;
-	struct sg_iovec sgvecs[];
+	struct iov_iter iter;
+	struct iovec iov[];
 };
 
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     const struct sg_iovec *iov, int iov_count,
-			     int is_our_pages)
-{
-	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
-	bmd->nr_sgvecs = iov_count;
-	bmd->is_our_pages = is_our_pages;
-	bio->bi_private = bmd;
-}
-
 static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
 					       gfp_t gfp_mask)
 {
@@ -1044,36 +1033,33 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
 		return NULL;
 
 	return kmalloc(sizeof(struct bio_map_data) +
-		       sizeof(struct sg_iovec) * iov_count, gfp_mask);
+		       sizeof(struct iovec) * iov_count, gfp_mask);
 }
 
-static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
-                          int to_user, int from_user)
+static int __bio_copy_iov(struct bio *bio, const struct iov_iter *iter,
+			  int to_user, int from_user)
 {
 	int ret = 0, i;
 	struct bio_vec *bvec;
-	int iov_idx = 0;
-	unsigned int iov_off = 0;
+	struct iov_iter iov_iter = *iter;
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		char *bv_addr = page_address(bvec->bv_page);
 		unsigned int bv_len = bvec->bv_len;
 
-		while (bv_len && iov_idx < iov_count) {
-			unsigned int bytes;
-			char __user *iov_addr;
-
-			bytes = min_t(unsigned int,
-				      iov[iov_idx].iov_len - iov_off, bv_len);
-			iov_addr = iov[iov_idx].iov_base + iov_off;
+		while (bv_len && iov_iter.count) {
+			struct iovec iov = iov_iter_iovec(&iov_iter);
+			unsigned int bytes = min_t(unsigned int, bv_len,
+						   iov.iov_len);
 
 			if (!ret) {
 				if (to_user)
-					ret = copy_to_user(iov_addr, bv_addr,
-							   bytes);
+					ret = copy_to_user(iov.iov_base,
+							   bv_addr, bytes);
 
 				if (from_user)
-					ret = copy_from_user(bv_addr, iov_addr,
+					ret = copy_from_user(bv_addr,
+							     iov.iov_base,
 							     bytes);
 
 				if (ret)
@@ -1082,13 +1068,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c
 
 			bv_len -= bytes;
 			bv_addr += bytes;
-			iov_addr += bytes;
-			iov_off += bytes;
-
-			if (iov[iov_idx].iov_len == iov_off) {
-				iov_idx++;
-				iov_off = 0;
-			}
+			iov_iter_advance(&iov_iter, bytes);
 		}
 	}
 
@@ -1122,7 +1102,7 @@ int bio_uncopy_user(struct bio *bio)
 		 * don't copy into a random user address space, just free.
 		 */
 		if (current->mm)
-			ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
+			ret = __bio_copy_iov(bio, &bmd->iter,
 					     bio_data_dir(bio) == READ, 0);
 		if (bmd->is_our_pages)
 			bio_free_pages(bio);
@@ -1135,12 +1115,10 @@ EXPORT_SYMBOL(bio_uncopy_user);
 
 /**
  *	bio_copy_user_iov	-	copy user data to bio
- *	@q: destination block queue
- *	@map_data: pointer to the rq_map_data holding pages (if necessary)
- *	@iov:	the iovec.
- *	@iov_count: number of elements in the iovec
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
+ *	@q:		destination block queue
+ *	@map_data:	pointer to the rq_map_data holding pages (if necessary)
+ *	@iter:		iovec iterator
+ *	@gfp_mask:	memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
@@ -1148,24 +1126,25 @@ EXPORT_SYMBOL(bio_uncopy_user);
  */
 struct bio *bio_copy_user_iov(struct request_queue *q,
 			      struct rq_map_data *map_data,
-			      const struct sg_iovec *iov, int iov_count,
-			      int write_to_vm, gfp_t gfp_mask)
+			      const struct iov_iter *iter,
+			      gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct page *page;
 	struct bio *bio;
 	int i, ret;
 	int nr_pages = 0;
-	unsigned int len = 0;
+	unsigned int len = iter->count;
 	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
 
-	for (i = 0; i < iov_count; i++) {
+	for (i = 0; i < iter->nr_segs; i++) {
 		unsigned long uaddr;
 		unsigned long end;
 		unsigned long start;
 
-		uaddr = (unsigned long)iov[i].iov_base;
-		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		uaddr = (unsigned long) iter->iov[i].iov_base;
+		end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1)
+			>> PAGE_SHIFT;
 		start = uaddr >> PAGE_SHIFT;
 
 		/*
@@ -1175,22 +1154,31 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 			return ERR_PTR(-EINVAL);
 
 		nr_pages += end - start;
-		len += iov[i].iov_len;
 	}
 
 	if (offset)
 		nr_pages++;
 
-	bmd = bio_alloc_map_data(iov_count, gfp_mask);
+	bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
+	/*
+	 * We need to do a deep copy of the iov_iter including the iovecs.
+	 * The caller provided iov might point to an on-stack or otherwise
+	 * shortlived one.
+	 */
+	bmd->is_our_pages = map_data ? 0 : 1;
+	memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs);
+	iov_iter_init(&bmd->iter, iter->type, bmd->iov,
+			iter->nr_segs, iter->count);
+
 	ret = -ENOMEM;
 	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
-	if (!write_to_vm)
+	if (iter->type & WRITE)
 		bio->bi_rw |= REQ_WRITE;
 
 	ret = 0;
@@ -1238,14 +1226,14 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	/*
 	 * success
 	 */
-	if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
+	if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) ||
 	    (map_data && map_data->from_user)) {
-		ret = __bio_copy_iov(bio, iov, iov_count, 0, 1);
+		ret = __bio_copy_iov(bio, iter, 0, 1);
 		if (ret)
 			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
+	bio->bi_private = bmd;
 	return bio;
 cleanup:
 	if (!map_data)
@@ -1258,19 +1246,21 @@ out_bmd:
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
-				      const struct sg_iovec *iov, int iov_count,
-				      int write_to_vm, gfp_t gfp_mask)
+				      const struct iov_iter *iter,
+				      gfp_t gfp_mask)
 {
-	int i, j;
+	int j;
 	int nr_pages = 0;
 	struct page **pages;
 	struct bio *bio;
 	int cur_page = 0;
 	int ret, offset;
+	struct iov_iter i;
+	struct iovec iov;
 
-	for (i = 0; i < iov_count; i++) {
-		unsigned long uaddr = (unsigned long)iov[i].iov_base;
-		unsigned long len = iov[i].iov_len;
+	iov_for_each(iov, i, *iter) {
+		unsigned long uaddr = (unsigned long) iov.iov_base;
+		unsigned long len = iov.iov_len;
 		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		unsigned long start = uaddr >> PAGE_SHIFT;
 
@@ -1300,16 +1290,17 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!pages)
 		goto out;
 
-	for (i = 0; i < iov_count; i++) {
-		unsigned long uaddr = (unsigned long)iov[i].iov_base;
-		unsigned long len = iov[i].iov_len;
+	iov_for_each(iov, i, *iter) {
+		unsigned long uaddr = (unsigned long) iov.iov_base;
+		unsigned long len = iov.iov_len;
 		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		unsigned long start = uaddr >> PAGE_SHIFT;
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
 
 		ret = get_user_pages_fast(uaddr, local_nr_pages,
-				write_to_vm, &pages[cur_page]);
+				(iter->type & WRITE) != WRITE,
+				&pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
@@ -1349,7 +1340,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	/*
 	 * set data direction, and check if mapped pages need bouncing
 	 */
-	if (!write_to_vm)
+	if (iter->type & WRITE)
 		bio->bi_rw |= REQ_WRITE;
 
 	bio->bi_bdev = bdev;
@@ -1357,10 +1348,10 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	return bio;
 
  out_unmap:
-	for (i = 0; i < nr_pages; i++) {
-		if(!pages[i])
+	for (j = 0; j < nr_pages; j++) {
+		if (!pages[j])
 			break;
-		page_cache_release(pages[i]);
+		page_cache_release(pages[j]);
 	}
  out:
 	kfree(pages);
@@ -1369,25 +1360,22 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 }
 
 /**
- *	bio_map_user_iov - map user sg_iovec table into bio
- *	@q: the struct request_queue for the bio
- *	@bdev: destination block device
- *	@iov:	the iovec.
- *	@iov_count: number of elements in the iovec
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
+ *	bio_map_user_iov - map user iovec into bio
+ *	@q:		the struct request_queue for the bio
+ *	@bdev:		destination block device
+ *	@iter:		iovec iterator
+ *	@gfp_mask:	memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
-			     const struct sg_iovec *iov, int iov_count,
-			     int write_to_vm, gfp_t gfp_mask)
+			     const struct iov_iter *iter,
+			     gfp_t gfp_mask)
 {
 	struct bio *bio;
 
-	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
-				 gfp_mask);
+	bio = __bio_map_user_iov(q, bdev, iter, gfp_mask);
 	if (IS_ERR(bio))
 		return bio;
 
diff --git a/block/blk-map.c b/block/blk-map.c
index 152a5fe5d85e..30e6bb871c5c 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -5,7 +5,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <scsi/sg.h>		/* for struct sg_iovec */
+#include <linux/uio.h>
 
 #include "blk.h"
 
@@ -44,9 +44,7 @@ static int __blk_rq_unmap_user(struct bio *bio)
  * @q:		request queue where request should be inserted
  * @rq:		request to map data to
  * @map_data:   pointer to the rq_map_data holding pages (if necessary)
- * @iov:	pointer to the iovec
- * @iov_count:	number of elements in the iovec
- * @len:	I/O byte count
+ * @iter:	iovec iterator
  * @gfp_mask:	memory allocation flags
  *
  * Description:
@@ -63,20 +61,21 @@ static int __blk_rq_unmap_user(struct bio *bio)
  *    unmapping.
  */
 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
-			struct rq_map_data *map_data, const struct sg_iovec *iov,
-			int iov_count, unsigned int len, gfp_t gfp_mask)
+			struct rq_map_data *map_data,
+			const struct iov_iter *iter, gfp_t gfp_mask)
 {
 	struct bio *bio;
-	int i, read = rq_data_dir(rq) == READ;
 	int unaligned = 0;
+	struct iov_iter i;
+	struct iovec iov;
 
-	if (!iov || iov_count <= 0)
+	if (!iter || !iter->count)
 		return -EINVAL;
 
-	for (i = 0; i < iov_count; i++) {
-		unsigned long uaddr = (unsigned long)iov[i].iov_base;
+	iov_for_each(iov, i, *iter) {
+		unsigned long uaddr = (unsigned long) iov.iov_base;
 
-		if (!iov[i].iov_len)
+		if (!iov.iov_len)
 			return -EINVAL;
 
 		/*
@@ -86,16 +85,15 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 			unaligned = 1;
 	}
 
-	if (unaligned || (q->dma_pad_mask & len) || map_data)
-		bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
-					gfp_mask);
+	if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
+		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
 	else
-		bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
+		bio = bio_map_user_iov(q, NULL, iter, gfp_mask);
 
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
-	if (bio->bi_iter.bi_size != len) {
+	if (bio->bi_iter.bi_size != iter->count) {
 		/*
 		 * Grab an extra reference to this bio, as bio_unmap_user()
 		 * expects to be able to drop it twice as it happens on the
@@ -121,12 +119,14 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 		    struct rq_map_data *map_data, void __user *ubuf,
 		    unsigned long len, gfp_t gfp_mask)
 {
-	struct sg_iovec iov;
+	struct iovec iov;
+	struct iov_iter i;
 
-	iov.iov_base = (void __user *)ubuf;
+	iov.iov_base = ubuf;
 	iov.iov_len = len;
+	iov_iter_init(&i, rq_data_dir(rq), &iov, 1, len);
 
-	return blk_rq_map_user_iov(q, rq, map_data, &iov, 1, len, gfp_mask);
+	return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask);
 }
 EXPORT_SYMBOL(blk_rq_map_user);
 
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 28163fad3c5d..e1f71c396193 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -332,7 +332,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 
 	ret = 0;
 	if (hdr->iovec_count) {
-		size_t iov_data_len;
+		struct iov_iter i;
 		struct iovec *iov = NULL;
 
 		ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
@@ -342,20 +342,11 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 			goto out_free_cdb;
 		}
 
-		iov_data_len = ret;
-		ret = 0;
-
 		/* SG_IO howto says that the shorter of the two wins */
-		if (hdr->dxfer_len < iov_data_len) {
-			hdr->iovec_count = iov_shorten(iov,
-						       hdr->iovec_count,
-						       hdr->dxfer_len);
-			iov_data_len = hdr->dxfer_len;
-		}
+		iov_iter_init(&i, rq_data_dir(rq), iov, hdr->iovec_count,
+			      min_t(unsigned, ret, hdr->dxfer_len));
 
-		ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
-					  hdr->iovec_count,
-					  iov_data_len, GFP_KERNEL);
+		ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL);
 		kfree(iov);
 	} else if (hdr->dxfer_len)
 		ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index b14f64cb9724..4052e592818c 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1734,22 +1734,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 	}
 
 	if (iov_count) {
-		int len, size = sizeof(struct sg_iovec) * iov_count;
+		int size = sizeof(struct iovec) * iov_count;
 		struct iovec *iov;
+		struct iov_iter i;
 
 		iov = memdup_user(hp->dxferp, size);
 		if (IS_ERR(iov))
 			return PTR_ERR(iov);
 
-		len = iov_length(iov, iov_count);
-		if (hp->dxfer_len < len) {
-			iov_count = iov_shorten(iov, iov_count, hp->dxfer_len);
-			len = hp->dxfer_len;
-		}
+		iov_iter_init(&i, rw, iov, iov_count,
+			      min_t(size_t, hp->dxfer_len,
+				    iov_length(iov, iov_count)));
 
-		res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov,
-					  iov_count,
-					  len, GFP_ATOMIC);
+		res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC);
 		kfree(iov);
 	} else
 		res = blk_rq_map_user(q, rq, md, hp->dxferp,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d0d6735d61da..0d6105b34ffa 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -428,11 +428,10 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 extern int bio_get_nr_vecs(struct block_device *);
-struct sg_iovec;
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    struct block_device *,
-				    const struct sg_iovec *, int, int, gfp_t);
+				    const struct iov_iter *, gfp_t);
 extern void bio_unmap_user(struct bio *);
 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
 				gfp_t);
@@ -462,8 +461,8 @@ extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 
 extern struct bio *bio_copy_user_iov(struct request_queue *,
 				     struct rq_map_data *,
-				     const struct sg_iovec *,
-				     int, int, gfp_t);
+				     const struct iov_iter *,
+				     gfp_t);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
 extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 13e16401a7ce..bf4ef666d191 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -855,8 +855,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *,
 extern int blk_rq_unmap_user(struct bio *);
 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-			       struct rq_map_data *, const struct sg_iovec *,
-			       int, unsigned int, gfp_t);
+			       struct rq_map_data *, const struct iov_iter *,
+			       gfp_t);
 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
-- 
cgit v1.2.3


From 37f19e57a0de3c4a3417aa13ff4d04f1e0dee4b3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 18 Jan 2015 16:16:33 +0100
Subject: block: merge __bio_map_user_iov into bio_map_user_iov

And also remove the unused bdev argument.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 56 +++++++++++++++++++----------------------------------
 block/blk-map.c     |  2 +-
 include/linux/bio.h |  1 -
 3 files changed, 21 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index a69a9c9e7c93..0723d4ce8589 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1244,10 +1244,18 @@ out_bmd:
 	return ERR_PTR(ret);
 }
 
-static struct bio *__bio_map_user_iov(struct request_queue *q,
-				      struct block_device *bdev,
-				      const struct iov_iter *iter,
-				      gfp_t gfp_mask)
+/**
+ *	bio_map_user_iov - map user iovec into bio
+ *	@q:		the struct request_queue for the bio
+ *	@iter:		iovec iterator
+ *	@gfp_mask:	memory allocation flags
+ *
+ *	Map the user space address into a bio suitable for io to a block
+ *	device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_user_iov(struct request_queue *q,
+			     const struct iov_iter *iter,
+			     gfp_t gfp_mask)
 {
 	int j;
 	int nr_pages = 0;
@@ -1343,8 +1351,15 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (iter->type & WRITE)
 		bio->bi_rw |= REQ_WRITE;
 
-	bio->bi_bdev = bdev;
 	bio->bi_flags |= (1 << BIO_USER_MAPPED);
+
+	/*
+	 * subtle -- if __bio_map_user() ended up bouncing a bio,
+	 * it would normally disappear when its bi_end_io is run.
+	 * however, we need it for the unmap, so grab an extra
+	 * reference to it
+	 */
+	bio_get(bio);
 	return bio;
 
  out_unmap:
@@ -1359,37 +1374,6 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	return ERR_PTR(ret);
 }
 
-/**
- *	bio_map_user_iov - map user iovec into bio
- *	@q:		the struct request_queue for the bio
- *	@bdev:		destination block device
- *	@iter:		iovec iterator
- *	@gfp_mask:	memory allocation flags
- *
- *	Map the user space address into a bio suitable for io to a block
- *	device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
-			     const struct iov_iter *iter,
-			     gfp_t gfp_mask)
-{
-	struct bio *bio;
-
-	bio = __bio_map_user_iov(q, bdev, iter, gfp_mask);
-	if (IS_ERR(bio))
-		return bio;
-
-	/*
-	 * subtle -- if __bio_map_user() ended up bouncing a bio,
-	 * it would normally disappear when its bi_end_io is run.
-	 * however, we need it for the unmap, so grab an extra
-	 * reference to it
-	 */
-	bio_get(bio);
-
-	return bio;
-}
-
 static void __bio_unmap_user(struct bio *bio)
 {
 	struct bio_vec *bvec;
diff --git a/block/blk-map.c b/block/blk-map.c
index 30e6bb871c5c..0f22911f17dc 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -88,7 +88,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
 		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
 	else
-		bio = bio_map_user_iov(q, NULL, iter, gfp_mask);
+		bio = bio_map_user_iov(q, iter, gfp_mask);
 
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0d6105b34ffa..da3a127c9958 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -430,7 +430,6 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 extern int bio_get_nr_vecs(struct block_device *);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
-				    struct block_device *,
 				    const struct iov_iter *, gfp_t);
 extern void bio_unmap_user(struct bio *);
 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
-- 
cgit v1.2.3


From dbf9782c1078c537831201c73ac60c9623ae9370 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 16 Dec 2014 18:44:36 -0500
Subject: dm: remove exports for request-based interfaces without external
 callers

Remove exports for dm_dispatch_request, dm_requeue_unmapped_request,
and dm_kill_unmapped_request.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c               | 9 +++------
 include/linux/device-mapper.h | 3 ---
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 71e6b73fe78d..5920a9af7bd6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1062,7 +1062,7 @@ static void dm_unprep_request(struct request *rq)
 /*
  * Requeue the original request of a clone.
  */
-void dm_requeue_unmapped_request(struct request *clone)
+static void dm_requeue_unmapped_request(struct request *clone)
 {
 	int rw = rq_data_dir(clone);
 	struct dm_rq_target_io *tio = clone->end_io_data;
@@ -1079,7 +1079,6 @@ void dm_requeue_unmapped_request(struct request *clone)
 
 	rq_completed(md, rw, 0);
 }
-EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
 
 static void __stop_queue(struct request_queue *q)
 {
@@ -1177,7 +1176,7 @@ static void dm_complete_request(struct request *clone, int error)
  * Target's rq_end_io() function isn't called.
  * This may be used when the target's map_rq() function fails.
  */
-void dm_kill_unmapped_request(struct request *clone, int error)
+static void dm_kill_unmapped_request(struct request *clone, int error)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
@@ -1185,7 +1184,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	rq->cmd_flags |= REQ_FAILED;
 	dm_complete_request(clone, error);
 }
-EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
 
 /*
  * Called with the queue lock held
@@ -1686,7 +1684,7 @@ static void dm_request(struct request_queue *q, struct bio *bio)
 		_dm_request(q, bio);
 }
 
-void dm_dispatch_request(struct request *rq)
+static void dm_dispatch_request(struct request *rq)
 {
 	int r;
 
@@ -1698,7 +1696,6 @@ void dm_dispatch_request(struct request *rq)
 	if (r)
 		dm_complete_request(rq, r);
 }
-EXPORT_SYMBOL_GPL(dm_dispatch_request);
 
 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 				 void *data)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ca6d2acc5eb7..19296fba58e8 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -600,9 +600,6 @@ static inline unsigned long to_bytes(sector_t n)
 /*-----------------------------------------------------------------
  * Helper for block layer and dm core operations
  *---------------------------------------------------------------*/
-void dm_dispatch_request(struct request *rq);
-void dm_requeue_unmapped_request(struct request *rq);
-void dm_kill_unmapped_request(struct request *rq, int error);
 int dm_underlying_device_busy(struct request_queue *q);
 
 #endif	/* _LINUX_DEVICE_MAPPER_H */
-- 
cgit v1.2.3


From e5863d9ad754926e7d3f38b43ac8bd48ef73b097 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 17 Dec 2014 21:08:12 -0500
Subject: dm: allocate requests in target when stacking on blk-mq devices

For blk-mq request-based DM the responsibility of allocating a cloned
request is transfered from DM core to the target type.  Doing so
enables the cloned request to be allocated from the appropriate
blk-mq request_queue's pool (only the DM target, e.g. multipath, can
know which block device to send a given cloned request to).

Care was taken to preserve compatibility with old-style block request
completion that requires request-based DM _not_ acquire the clone
request's queue lock in the completion path.  As such, there are now 2
different request-based DM target_type interfaces:
1) the original .map_rq() interface will continue to be used for
   non-blk-mq devices -- the preallocated clone request is passed in
   from DM core.
2) a new .clone_and_map_rq() and .release_clone_rq() will be used for
   blk-mq devices -- blk_get_request() and blk_put_request() are used
   respectively from these hooks.

dm_table_set_type() was updated to detect if the request-based target is
being stacked on blk-mq devices, if so DM_TYPE_MQ_REQUEST_BASED is set.
DM core disallows switching the DM table's type after it is set.  This
means that there is no mixing of non-blk-mq and blk-mq devices within
the same request-based DM table.

[This patch was started by Keith and later heavily modified by Mike]

Tested-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c         |  51 ++++++++++++++++---
 drivers/md/dm-table.c         |  34 +++++++++++--
 drivers/md/dm-target.c        |  15 +++++-
 drivers/md/dm.c               | 114 +++++++++++++++++++++++++++++++-----------
 drivers/md/dm.h               |   8 +--
 include/linux/device-mapper.h |   7 +++
 include/uapi/linux/dm-ioctl.h |   4 +-
 7 files changed, 185 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2552b88f8953..863fc8c1ac06 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -11,6 +11,7 @@
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
+#include <linux/blkdev.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/mempool.h>
@@ -378,12 +379,13 @@ static int __must_push_back(struct multipath *m)
 /*
  * Map cloned requests
  */
-static int multipath_map(struct dm_target *ti, struct request *clone,
-			 union map_info *map_context)
+static int __multipath_map(struct dm_target *ti, struct request *clone,
+			   union map_info *map_context,
+			   struct request *rq, struct request **__clone)
 {
 	struct multipath *m = (struct multipath *) ti->private;
 	int r = DM_MAPIO_REQUEUE;
-	size_t nr_bytes = blk_rq_bytes(clone);
+	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
 	struct pgpath *pgpath;
 	struct block_device *bdev;
 	struct dm_mpath_io *mpio;
@@ -416,12 +418,25 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
 
 	bdev = pgpath->path.dev->bdev;
 
-	clone->q = bdev_get_queue(bdev);
-	clone->rq_disk = bdev->bd_disk;
-	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-
 	spin_unlock_irq(&m->lock);
 
+	if (clone) {
+		/* Old request-based interface: allocated clone is passed in */
+		clone->q = bdev_get_queue(bdev);
+		clone->rq_disk = bdev->bd_disk;
+		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	} else {
+		/* blk-mq request-based interface */
+		*__clone = blk_get_request(bdev_get_queue(bdev),
+					   rq_data_dir(rq), GFP_KERNEL);
+		if (IS_ERR(*__clone))
+			/* ENOMEM, requeue */
+			return r;
+		(*__clone)->bio = (*__clone)->biotail = NULL;
+		(*__clone)->rq_disk = bdev->bd_disk;
+		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	}
+
 	if (pgpath->pg->ps.type->start_io)
 		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
 					      &pgpath->path,
@@ -434,6 +449,24 @@ out_unlock:
 	return r;
 }
 
+static int multipath_map(struct dm_target *ti, struct request *clone,
+			 union map_info *map_context)
+{
+	return __multipath_map(ti, clone, map_context, NULL, NULL);
+}
+
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+				   union map_info *map_context,
+				   struct request **clone)
+{
+	return __multipath_map(ti, NULL, map_context, rq, clone);
+}
+
+static void multipath_release_clone(struct request *clone)
+{
+	blk_put_request(clone);
+}
+
 /*
  * If we run out of usable paths, should we queue I/O or error it?
  */
@@ -1670,11 +1703,13 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 7, 0},
+	.version = {1, 8, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
 	.map_rq = multipath_map,
+	.clone_and_map_rq = multipath_clone_and_map,
+	.release_clone_rq = multipath_release_clone,
 	.rq_end_io = multipath_end_io,
 	.presuspend = multipath_presuspend,
 	.postsuspend = multipath_postsuspend,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3afae9e062f8..2d7e373955f3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -827,6 +827,7 @@ static int dm_table_set_type(struct dm_table *t)
 {
 	unsigned i;
 	unsigned bio_based = 0, request_based = 0, hybrid = 0;
+	bool use_blk_mq = false;
 	struct dm_target *tgt;
 	struct dm_dev_internal *dd;
 	struct list_head *devices;
@@ -872,11 +873,26 @@ static int dm_table_set_type(struct dm_table *t)
 	/* Non-request-stackable devices can't be used for request-based dm */
 	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
-		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) {
-			DMWARN("table load rejected: including"
-			       " non-request-stackable devices");
+		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
+
+		if (!blk_queue_stackable(q)) {
+			DMERR("table load rejected: including"
+			      " non-request-stackable devices");
 			return -EINVAL;
 		}
+
+		if (q->mq_ops)
+			use_blk_mq = true;
+	}
+
+	if (use_blk_mq) {
+		/* verify _all_ devices in the table are blk-mq devices */
+		list_for_each_entry(dd, devices, list)
+			if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
+				DMERR("table load rejected: not all devices"
+				      " are blk-mq request-stackable");
+				return -EINVAL;
+			}
 	}
 
 	/*
@@ -890,7 +906,7 @@ static int dm_table_set_type(struct dm_table *t)
 		return -EINVAL;
 	}
 
-	t->type = DM_TYPE_REQUEST_BASED;
+	t->type = !use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
 
 	return 0;
 }
@@ -907,7 +923,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
 
 bool dm_table_request_based(struct dm_table *t)
 {
-	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
+	unsigned table_type = dm_table_get_type(t);
+
+	return (table_type == DM_TYPE_REQUEST_BASED ||
+		table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
+bool dm_table_mq_request_based(struct dm_table *t)
+{
+	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
 }
 
 static int dm_table_alloc_md_mempools(struct dm_table *t)
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 242e3cec397a..925ec1b15e75 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone,
 	return -EIO;
 }
 
+static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
+				   union map_info *map_context,
+				   struct request **clone)
+{
+	return -EIO;
+}
+
+static void io_err_release_clone_rq(struct request *clone)
+{
+}
+
 static struct target_type error_target = {
 	.name = "error",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
 	.map  = io_err_map,
 	.map_rq = io_err_map_rq,
+	.clone_and_map_rq = io_err_clone_and_map_rq,
+	.release_clone_rq = io_err_release_clone_rq,
 };
 
 int __init dm_target_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ae1219893948..549b815999a1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1044,7 +1044,10 @@ static void free_rq_clone(struct request *clone)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
 	blk_rq_unprep_clone(clone);
-	free_clone_request(tio->md, clone);
+	if (clone->q && clone->q->mq_ops)
+		tio->ti->type->release_clone_rq(clone);
+	else
+		free_clone_request(tio->md, clone);
 	free_rq_tio(tio);
 }
 
@@ -1086,7 +1089,8 @@ static void dm_unprep_request(struct request *rq)
 	rq->special = NULL;
 	rq->cmd_flags &= ~REQ_DONTPREP;
 
-	free_rq_clone(clone);
+	if (clone)
+		free_rq_clone(clone);
 }
 
 /*
@@ -1185,6 +1189,13 @@ static void dm_softirq_done(struct request *rq)
 	struct dm_rq_target_io *tio = rq->special;
 	struct request *clone = tio->clone;
 
+	if (!clone) {
+		blk_end_request_all(rq, tio->error);
+		rq_completed(tio->md, rq_data_dir(rq), false);
+		free_rq_tio(tio);
+		return;
+	}
+
 	if (rq->cmd_flags & REQ_FAILED)
 		mapped = false;
 
@@ -1207,7 +1218,7 @@ static void dm_complete_request(struct request *rq, int error)
  * Complete the not-mapped clone and the original request with the error status
  * through softirq context.
  * Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() function fails.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
  */
 static void dm_kill_unmapped_request(struct request *rq, int error)
 {
@@ -1222,13 +1233,15 @@ static void end_clone_request(struct request *clone, int error)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
-	/*
-	 * For just cleaning up the information of the queue in which
-	 * the clone was dispatched.
-	 * The clone is *NOT* freed actually here because it is alloced from
-	 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
-	 */
-	__blk_put_request(clone->q, clone);
+	if (!clone->q->mq_ops) {
+		/*
+		 * For just cleaning up the information of the queue in which
+		 * the clone was dispatched.
+		 * The clone is *NOT* freed actually here because it is alloced
+		 * from dm own mempool (REQ_ALLOCED isn't set).
+		 */
+		__blk_put_request(clone->q, clone);
+	}
 
 	/*
 	 * Actual request completion is done in a softirq context which doesn't
@@ -1789,6 +1802,8 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
 					struct mapped_device *md, gfp_t gfp_mask)
 {
 	struct dm_rq_target_io *tio;
+	int srcu_idx;
+	struct dm_table *table;
 
 	tio = alloc_rq_tio(md, gfp_mask);
 	if (!tio)
@@ -1802,10 +1817,15 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
 	memset(&tio->info, 0, sizeof(tio->info));
 	init_kthread_work(&tio->work, map_tio_request);
 
-	if (!clone_rq(rq, md, tio, gfp_mask)) {
-		free_rq_tio(tio);
-		return NULL;
+	table = dm_get_live_table(md, &srcu_idx);
+	if (!dm_table_mq_request_based(table)) {
+		if (!clone_rq(rq, md, tio, gfp_mask)) {
+			dm_put_live_table(md, srcu_idx);
+			free_rq_tio(tio);
+			return NULL;
+		}
 	}
+	dm_put_live_table(md, srcu_idx);
 
 	return tio;
 }
@@ -1835,17 +1855,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 
 /*
  * Returns:
- * 0  : the request has been processed (not requeued)
- * !0 : the request has been requeued
+ * 0                : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0              : the request was completed due to failure
  */
 static int map_request(struct dm_target *ti, struct request *rq,
 		       struct mapped_device *md)
 {
-	int r, requeued = 0;
+	int r;
 	struct dm_rq_target_io *tio = rq->special;
-	struct request *clone = tio->clone;
+	struct request *clone = NULL;
+
+	if (tio->clone) {
+		clone = tio->clone;
+		r = ti->type->map_rq(ti, clone, &tio->info);
+	} else {
+		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+		if (r < 0) {
+			/* The target wants to complete the I/O */
+			dm_kill_unmapped_request(rq, r);
+			return r;
+		}
+		if (IS_ERR(clone))
+			return DM_MAPIO_REQUEUE;
+		if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
+	}
 
-	r = ti->type->map_rq(ti, clone, &tio->info);
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
 		/* The target has taken the I/O to submit by itself later */
@@ -1859,7 +1898,6 @@ static int map_request(struct dm_target *ti, struct request *rq,
 	case DM_MAPIO_REQUEUE:
 		/* The target wants to requeue the I/O */
 		dm_requeue_unmapped_request(clone);
-		requeued = 1;
 		break;
 	default:
 		if (r > 0) {
@@ -1869,17 +1907,20 @@ static int map_request(struct dm_target *ti, struct request *rq,
 
 		/* The target wants to complete the I/O */
 		dm_kill_unmapped_request(rq, r);
-		break;
+		return r;
 	}
 
-	return requeued;
+	return 0;
 }
 
 static void map_tio_request(struct kthread_work *work)
 {
 	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+	struct request *rq = tio->orig;
+	struct mapped_device *md = tio->md;
 
-	map_request(tio->ti, tio->orig, tio->md);
+	if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+		dm_requeue_unmapped_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -2459,6 +2500,14 @@ unsigned dm_get_md_type(struct mapped_device *md)
 	return md->type;
 }
 
+static bool dm_md_type_request_based(struct mapped_device *md)
+{
+	unsigned table_type = dm_get_md_type(md);
+
+	return (table_type == DM_TYPE_REQUEST_BASED ||
+		table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 {
 	return md->immutable_target_type;
@@ -2511,8 +2560,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-	if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
-	    !dm_init_request_based_queue(md)) {
+	if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
 		DMWARN("Cannot initialize queue for request-based mapped device");
 		return -EINVAL;
 	}
@@ -3184,27 +3232,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 {
 	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	struct kmem_cache *cachep;
-	unsigned int pool_size;
+	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
 	if (!pools)
 		return NULL;
 
-	if (type == DM_TYPE_BIO_BASED) {
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
 		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-	} else if (type == DM_TYPE_REQUEST_BASED) {
-		cachep = _rq_tio_cache;
+		break;
+	case DM_TYPE_REQUEST_BASED:
 		pool_size = dm_get_reserved_rq_based_ios();
 		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
 		if (!pools->rq_pool)
 			goto out;
+		/* fall through to setup remaining rq-based pools */
+	case DM_TYPE_MQ_REQUEST_BASED:
+		cachep = _rq_tio_cache;
+		if (!pool_size)
+			pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
-	} else
+		break;
+	default:
 		goto out;
+	}
 
 	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
 	if (!pools->io_pool)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 84b0f9e4ba6c..84d79784b866 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -34,9 +34,10 @@
 /*
  * Type of table and mapped_device's mempool
  */
-#define DM_TYPE_NONE		0
-#define DM_TYPE_BIO_BASED	1
-#define DM_TYPE_REQUEST_BASED	2
+#define DM_TYPE_NONE			0
+#define DM_TYPE_BIO_BASED		1
+#define DM_TYPE_REQUEST_BASED		2
+#define DM_TYPE_MQ_REQUEST_BASED	3
 
 /*
  * List of devices that a metadevice uses and should open/close.
@@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
+bool dm_table_mq_request_based(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 19296fba58e8..2646aed1d3fe 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -48,6 +48,11 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio);
 typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
 				  union map_info *map_context);
+typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
+					    struct request *rq,
+					    union map_info *map_context,
+					    struct request **clone);
+typedef void (*dm_release_clone_request_fn) (struct request *clone);
 
 /*
  * Returns:
@@ -143,6 +148,8 @@ struct target_type {
 	dm_dtr_fn dtr;
 	dm_map_fn map;
 	dm_map_request_fn map_rq;
+	dm_clone_and_map_request_fn clone_and_map_rq;
+	dm_release_clone_request_fn release_clone_rq;
 	dm_endio_fn end_io;
 	dm_request_endio_fn rq_end_io;
 	dm_presuspend_fn presuspend;
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index a570d7b5796c..889f3a5b7b18 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	29
+#define DM_VERSION_MINOR	30
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2014-10-28)"
+#define DM_VERSION_EXTRA	"-ioctl (2014-12-22)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From 3cf385713460eb2bb4cb7ceb8ed89833b00b594b Mon Sep 17 00:00:00 2001
From: Antonios Motakis <a.motakis@virtualopensystems.com>
Date: Tue, 6 Jan 2015 11:15:11 +0100
Subject: ARM: 8256/1: driver coamba: add device binding path 'driver_override'

As already demonstrated with PCI [1] and the platform bus [2], a
driver_override property in sysfs can be used to bypass the id
matching of a device to a AMBA driver. This can be used by VFIO to
bind to any AMBA device requested by the user.

[1] http://lists-archives.com/linux-kernel/28030441-pci-introduce-new-device-binding-path-using-pci_dev-driver_override.html
[2] https://www.redhat.com/archives/libvir-list/2014-April/msg00382.html

Signed-off-by: Antonios Motakis <a.motakis@virtualopensystems.com>
Reviewed-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 Documentation/ABI/testing/sysfs-bus-amba | 20 ++++++++++++++
 drivers/amba/bus.c                       | 47 ++++++++++++++++++++++++++++++++
 include/linux/amba/bus.h                 |  1 +
 3 files changed, 68 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-amba

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-amba b/Documentation/ABI/testing/sysfs-bus-amba
new file mode 100644
index 000000000000..e7b54677cfbe
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-amba
@@ -0,0 +1,20 @@
+What:		/sys/bus/amba/devices/.../driver_override
+Date:		September 2014
+Contact:	Antonios Motakis <a.motakis@virtualopensystems.com>
+Description:
+		This file allows the driver for a device to be specified which
+		will override standard OF, ACPI, ID table, and name matching.
+		When specified, only a driver with a name matching the value
+		written to driver_override will have an opportunity to bind to
+		the device. The override is specified by writing a string to the
+		driver_override file (echo vfio-amba > driver_override)	and may
+		be cleared with an empty string (echo > driver_override).
+		This returns the device to standard matching rules binding.
+		Writing to driver_override does not automatically unbind the
+		device from its current driver or make any attempt to
+		automatically load the specified driver. If no driver with a
+		matching name is currently loaded in the kernel, the device will
+		not bind to any driver. This also allows devices to opt-out of
+		driver binding using a driver_override name such as "none".
+		Only a single driver may be specified in the override, there is
+		no support for parsing delimiters.
diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 52ddd9fbb55e..f0099360039e 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -18,6 +18,7 @@
 #include <linux/pm_domain.h>
 #include <linux/amba/bus.h>
 #include <linux/sizes.h>
+#include <linux/limits.h>
 
 #include <asm/irq.h>
 
@@ -43,6 +44,10 @@ static int amba_match(struct device *dev, struct device_driver *drv)
 	struct amba_device *pcdev = to_amba_device(dev);
 	struct amba_driver *pcdrv = to_amba_driver(drv);
 
+	/* When driver_override is set, only bind to the matching driver */
+	if (pcdev->driver_override)
+		return !strcmp(pcdev->driver_override, drv->name);
+
 	return amba_lookup(pcdrv->id_table, pcdev) != NULL;
 }
 
@@ -59,6 +64,47 @@ static int amba_uevent(struct device *dev, struct kobj_uevent_env *env)
 	return retval;
 }
 
+static ssize_t driver_override_show(struct device *_dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct amba_device *dev = to_amba_device(_dev);
+
+	if (!dev->driver_override)
+		return 0;
+
+	return sprintf(buf, "%s\n", dev->driver_override);
+}
+
+static ssize_t driver_override_store(struct device *_dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct amba_device *dev = to_amba_device(_dev);
+	char *driver_override, *old = dev->driver_override, *cp;
+
+	if (count > PATH_MAX)
+		return -EINVAL;
+
+	driver_override = kstrndup(buf, count, GFP_KERNEL);
+	if (!driver_override)
+		return -ENOMEM;
+
+	cp = strchr(driver_override, '\n');
+	if (cp)
+		*cp = '\0';
+
+	if (strlen(driver_override)) {
+		dev->driver_override = driver_override;
+	} else {
+	       kfree(driver_override);
+	       dev->driver_override = NULL;
+	}
+
+	kfree(old);
+
+	return count;
+}
+
 #define amba_attr_func(name,fmt,arg...)					\
 static ssize_t name##_show(struct device *_dev,				\
 			   struct device_attribute *attr, char *buf)	\
@@ -81,6 +127,7 @@ amba_attr_func(resource, "\t%016llx\t%016llx\t%016lx\n",
 static struct device_attribute amba_dev_attrs[] = {
 	__ATTR_RO(id),
 	__ATTR_RO(resource),
+	__ATTR_RW(driver_override),
 	__ATTR_NULL,
 };
 
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 0ab5f8e0dea2..50fc66868402 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -33,6 +33,7 @@ struct amba_device {
 	struct clk		*pclk;
 	unsigned int		periphid;
 	unsigned int		irq[AMBA_NR_IRQS];
+	char			*driver_override;
 };
 
 struct amba_driver {
-- 
cgit v1.2.3


From 201f201c33220f53856fd300e1990b779538d67f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 10 Feb 2015 13:31:34 -0700
Subject: blk-mq: make blk_mq_run_queues() static

We no longer use it outside of blk-mq.c, so we can make it static
and stop exporting it. Additionally, kill the 'async' argument, as
there's only one used of it.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 9 ++++-----
 include/linux/blk-mq.h | 1 -
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index eb8e694fda06..1e4d4599d9c5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,6 +33,7 @@ static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
+static void blk_mq_run_queues(struct request_queue *q);
 
 /*
  * Check if any of the ctx's have pending work in this hardware queue
@@ -117,7 +118,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 
 	if (freeze) {
 		percpu_ref_kill(&q->mq_usage_counter);
-		blk_mq_run_queues(q, false);
+		blk_mq_run_queues(q);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@ -853,7 +854,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 			&hctx->run_work, 0);
 }
 
-void blk_mq_run_queues(struct request_queue *q, bool async)
+static void blk_mq_run_queues(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -864,10 +865,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
-		blk_mq_run_hw_queue(hctx, async);
+		blk_mq_run_hw_queue(hctx, false);
 	}
 }
-EXPORT_SYMBOL(blk_mq_run_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
@@ -905,7 +905,6 @@ void blk_mq_start_hw_queues(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queues);
 
-
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 86b08b1a5eba..ac6c7f534de1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -175,7 +175,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_insert_request(struct request *, bool, bool, bool);
-void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-- 
cgit v1.2.3


From d427e3c82ef4fc5fbb22c0cef0b040e6767b1028 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Feb 2015 14:07:50 +0100
Subject: block: remove unused function blk_bio_map_sg

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c      | 29 -----------------------------
 include/linux/blkdev.h |  2 --
 2 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 9476b1528ded..fc1ff3b1ea1f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -283,35 +283,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
-/**
- * blk_bio_map_sg - map a bio to a scatterlist
- * @q: request_queue in question
- * @bio: bio being mapped
- * @sglist: scatterlist being mapped
- *
- * Note:
- *    Caller must make sure sg can hold bio->bi_phys_segments entries
- *
- * Will return the number of sg entries setup
- */
-int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
-		   struct scatterlist *sglist)
-{
-	struct scatterlist *sg = NULL;
-	int nsegs;
-	struct bio *next = bio->bi_next;
-	bio->bi_next = NULL;
-
-	nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
-	bio->bi_next = next;
-	if (sg)
-		sg_mark_end(sg);
-
-	BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
-	return nsegs;
-}
-EXPORT_SYMBOL(blk_bio_map_sg);
-
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bf4ef666d191..7f9a516f24de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1049,8 +1049,6 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
-extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
-			  struct scatterlist *sglist);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
-- 
cgit v1.2.3


From f7219b527b5710ae0c4add8faa4d0ea529722cb5 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 10 Feb 2015 12:55:03 -0800
Subject: treewide: Remove unnecessary BCMA_CORETABLE_END macro

Use the normal {} instead of a macro to terminate an array.

Remove the macro too.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bgmac.c                 | 2 +-
 drivers/net/wireless/b43/main.c                       | 2 +-
 drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c | 2 +-
 drivers/spi/spi-bcm53xx.c                             | 2 +-
 drivers/usb/host/bcma-hcd.c                           | 2 +-
 include/linux/mod_devicetable.h                       | 2 --
 6 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index 3007d95fbb9f..ea63cb085d19 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -21,7 +21,7 @@
 static const struct bcma_device_id bgmac_bcma_tbl[] = {
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_4706_MAC_GBIT, BCMA_ANY_REV, BCMA_ANY_CLASS),
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_MAC_GBIT, BCMA_ANY_REV, BCMA_ANY_CLASS),
-	BCMA_CORETABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(bcma, bgmac_bcma_tbl);
 
diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c
index 2c9088633ec6..6acb2b51ab8f 100644
--- a/drivers/net/wireless/b43/main.c
+++ b/drivers/net/wireless/b43/main.c
@@ -127,7 +127,7 @@ static const struct bcma_device_id b43_bcma_tbl[] = {
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 0x1E, BCMA_ANY_CLASS),
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 0x28, BCMA_ANY_CLASS),
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 0x2A, BCMA_ANY_CLASS),
-	BCMA_CORETABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(bcma, b43_bcma_tbl);
 #endif
diff --git a/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c
index f95b52442281..48135063347e 100644
--- a/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c
@@ -99,7 +99,7 @@ static struct bcma_device_id brcms_coreid_table[] = {
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 17, BCMA_ANY_CLASS),
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 23, BCMA_ANY_CLASS),
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 24, BCMA_ANY_CLASS),
-	BCMA_CORETABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(bcma, brcms_coreid_table);
 
diff --git a/drivers/spi/spi-bcm53xx.c b/drivers/spi/spi-bcm53xx.c
index 17b34cbadc03..3fb91c81015a 100644
--- a/drivers/spi/spi-bcm53xx.c
+++ b/drivers/spi/spi-bcm53xx.c
@@ -216,7 +216,7 @@ static struct spi_board_info bcm53xx_info = {
 
 static const struct bcma_device_id bcm53xxspi_bcma_tbl[] = {
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_NS_QSPI, BCMA_ANY_REV, BCMA_ANY_CLASS),
-	BCMA_CORETABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(bcma, bcm53xxspi_bcma_tbl);
 
diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c
index cd6d0afb6b8f..526cfab41d5f 100644
--- a/drivers/usb/host/bcma-hcd.c
+++ b/drivers/usb/host/bcma-hcd.c
@@ -306,7 +306,7 @@ static int bcma_hcd_resume(struct bcma_device *dev)
 
 static const struct bcma_device_id bcma_hcd_table[] = {
 	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_USB20_HOST, BCMA_ANY_REV, BCMA_ANY_CLASS),
-	BCMA_CORETABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(bcma, bcma_hcd_table);
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 745def862580..6711d57f9c05 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -381,8 +381,6 @@ struct bcma_device_id {
 } __attribute__((packed,aligned(2)));
 #define BCMA_CORE(_manuf, _id, _rev, _class)  \
 	{ .manuf = _manuf, .id = _id, .rev = _rev, .class = _class, }
-#define BCMA_CORETABLE_END  \
-	{ 0, },
 
 #define BCMA_ANY_MANUF		0xFFFF
 #define BCMA_ANY_ID		0xFFFF
-- 
cgit v1.2.3


From 673e2baaa6d986e2fcd9c867661d8113f6c7dc7b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 10 Feb 2015 13:19:24 -0800
Subject: treewide: Remove unnecessary SSB_DEVTABLE_END macro

Use the normal {} instead of a macro to terminate an array.

Remove the macro too.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/b44.c   | 2 +-
 drivers/net/wireless/b43/main.c       | 2 +-
 drivers/net/wireless/b43legacy/main.c | 2 +-
 drivers/ssb/driver_gige.c             | 2 +-
 drivers/usb/host/ssb-hcd.c            | 2 +-
 include/linux/mod_devicetable.h       | 2 --
 6 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index d86d6baf9681..bd5916a60cb5 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -121,7 +121,7 @@ static struct pci_driver b44_pci_driver = {
 
 static const struct ssb_device_id b44_ssb_tbl[] = {
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_ETHERNET, SSB_ANY_REV),
-	SSB_DEVTABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(ssb, b44_ssb_tbl);
 
diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c
index 6acb2b51ab8f..ccbdb05b28cd 100644
--- a/drivers/net/wireless/b43/main.c
+++ b/drivers/net/wireless/b43/main.c
@@ -144,7 +144,7 @@ static const struct ssb_device_id b43_ssb_tbl[] = {
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 13),
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 15),
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 16),
-	SSB_DEVTABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(ssb, b43_ssb_tbl);
 #endif
diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c
index 1aec2146a2bf..4e58c0069830 100644
--- a/drivers/net/wireless/b43legacy/main.c
+++ b/drivers/net/wireless/b43legacy/main.c
@@ -86,7 +86,7 @@ MODULE_PARM_DESC(fwpostfix, "Postfix for the firmware files to load.");
 static const struct ssb_device_id b43legacy_ssb_tbl[] = {
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 2),
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 4),
-	SSB_DEVTABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(ssb, b43legacy_ssb_tbl);
 
diff --git a/drivers/ssb/driver_gige.c b/drivers/ssb/driver_gige.c
index 21f71a1581fa..e9734051e3c4 100644
--- a/drivers/ssb/driver_gige.c
+++ b/drivers/ssb/driver_gige.c
@@ -24,7 +24,7 @@ MODULE_LICENSE("GPL");
 
 static const struct ssb_device_id ssb_gige_tbl[] = {
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_ETHERNET_GBIT, SSB_ANY_REV),
-	SSB_DEVTABLE_END
+	{},
 };
 /* MODULE_DEVICE_TABLE(ssb, ssb_gige_tbl); */
 
diff --git a/drivers/usb/host/ssb-hcd.c b/drivers/usb/host/ssb-hcd.c
index 0196f766df73..ffc32f4b1b1b 100644
--- a/drivers/usb/host/ssb-hcd.c
+++ b/drivers/usb/host/ssb-hcd.c
@@ -251,7 +251,7 @@ static const struct ssb_device_id ssb_hcd_table[] = {
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_USB11_HOSTDEV, SSB_ANY_REV),
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_USB11_HOST, SSB_ANY_REV),
 	SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_USB20_HOST, SSB_ANY_REV),
-	SSB_DEVTABLE_END
+	{},
 };
 MODULE_DEVICE_TABLE(ssb, ssb_hcd_table);
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 6711d57f9c05..2bb97175b6cc 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -365,8 +365,6 @@ struct ssb_device_id {
 } __attribute__((packed, aligned(2)));
 #define SSB_DEVICE(_vendor, _coreid, _revision)  \
 	{ .vendor = _vendor, .coreid = _coreid, .revision = _revision, }
-#define SSB_DEVTABLE_END  \
-	{ 0, },
 
 #define SSB_ANY_VENDOR		0xFFFF
 #define SSB_ANY_ID		0xFFFF
-- 
cgit v1.2.3


From 26c4f7da3e413da697a7beb22ad496390eda7da0 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 10 Feb 2015 16:30:27 -0800
Subject: net: Fix remcsum in GRO path to not change packet

Remote checksum offload processing is currently the same for both
the GRO and non-GRO path. When the remote checksum offload option
is encountered, the checksum field referred to is modified in
the packet. So in the GRO case, the packet is modified in the
GRO path and then the operation is skipped when the packet goes
through the normal path based on skb->remcsum_offload. There is
a problem in that the packet may be modified in the GRO path, but
then forwarded off host still containing the remote checksum option.
A remote host will again perform RCO but now the checksum verification
will fail since GRO RCO already modified the checksum.

To fix this, we ensure that GRO restores a packet to it's original
state before returning. In this model, when GRO processes a remote
checksum option it still changes the checksum per the algorithm
but on return from lower layer processing the checksum is restored
to its original value.

In this patch we add define gro_remcsum structure which is passed
to skb_gro_remcsum_process to save offset and delta for the checksum
being changed. After lower layer processing, skb_gro_remcsum_cleanup
is called to restore the checksum before returning from GRO.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 19 +++++++++----------
 include/linux/netdevice.h | 25 +++++++++++++++++++++++--
 include/net/checksum.h    |  5 +++++
 net/ipv4/fou.c            | 20 ++++++++++----------
 4 files changed, 47 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 0e57e862c399..30310a63475a 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -555,12 +555,12 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 					  unsigned int off,
 					  struct vxlanhdr *vh, size_t hdrlen,
-					  u32 data)
+					  u32 data, struct gro_remcsum *grc)
 {
 	size_t start, offset, plen;
 
 	if (skb->remcsum_offload)
-		return vh;
+		return NULL;
 
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
@@ -579,7 +579,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 			return NULL;
 	}
 
-	skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset);
+	skb_gro_remcsum_process(skb, (void *)vh + hdrlen,
+				start, offset, grc);
 
 	skb->remcsum_offload = 1;
 
@@ -597,6 +598,9 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 	struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock,
 					     udp_offloads);
 	u32 flags;
+	struct gro_remcsum grc;
+
+	skb_gro_remcsum_init(&grc);
 
 	off_vx = skb_gro_offset(skb);
 	hlen = off_vx + sizeof(*vh);
@@ -614,7 +618,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 
 	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
 		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
-				       ntohl(vh->vx_vni));
+				       ntohl(vh->vx_vni), &grc);
 
 		if (!vh)
 			goto out;
@@ -637,6 +641,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 	pp = eth_gro_receive(head, skb);
 
 out:
+	skb_gro_remcsum_cleanup(skb, &grc);
 	NAPI_GRO_CB(skb)->flush |= flush;
 
 	return pp;
@@ -1154,12 +1159,6 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 {
 	size_t start, offset, plen;
 
-	if (skb->remcsum_offload) {
-		/* Already processed in GRO path */
-		skb->remcsum_offload = 0;
-		return vh;
-	}
-
 	start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
 	offset = start + ((data & VXLAN_RCO_UDP) ?
 			  offsetof(struct udphdr, check) :
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d115256ed5a2..3aa02457fe4f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2321,8 +2321,19 @@ do {									\
 					   compute_pseudo(skb, proto));	\
 } while (0)
 
+struct gro_remcsum {
+	int offset;
+	__wsum delta;
+};
+
+static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
+{
+	grc->delta = 0;
+}
+
 static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
-					   int start, int offset)
+					   int start, int offset,
+					   struct gro_remcsum *grc)
 {
 	__wsum delta;
 
@@ -2331,10 +2342,20 @@ static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
 	delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset);
 
 	/* Adjust skb->csum since we changed the packet */
-	skb->csum = csum_add(skb->csum, delta);
 	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+
+	grc->offset = (ptr + offset) - (void *)skb->head;
+	grc->delta = delta;
 }
 
+static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
+					   struct gro_remcsum *grc)
+{
+	if (!grc->delta)
+		return;
+
+	remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta);
+}
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
diff --git a/include/net/checksum.h b/include/net/checksum.h
index e339a9513e29..0a55ac715077 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -167,4 +167,9 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum,
 	return delta;
 }
 
+static inline void remcsum_unadjust(__sum16 *psum, __wsum delta)
+{
+	*psum = csum_fold(csum_sub(delta, *psum));
+}
+
 #endif
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 92ddea1e6457..7fa8d36e56d4 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -71,12 +71,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
 	size_t offset = ntohs(pd[1]);
 	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
 
-	if (skb->remcsum_offload) {
-		/* Already processed in GRO path */
-		skb->remcsum_offload = 0;
-		return guehdr;
-	}
-
 	if (!pskb_may_pull(skb, plen))
 		return NULL;
 	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
@@ -214,7 +208,8 @@ out_unlock:
 
 static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 				      struct guehdr *guehdr, void *data,
-				      size_t hdrlen, u8 ipproto)
+				      size_t hdrlen, u8 ipproto,
+				      struct gro_remcsum *grc)
 {
 	__be16 *pd = data;
 	size_t start = ntohs(pd[0]);
@@ -222,7 +217,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
 
 	if (skb->remcsum_offload)
-		return guehdr;
+		return NULL;
 
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
@@ -234,7 +229,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 			return NULL;
 	}
 
-	skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
+	skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
+				start, offset, grc);
 
 	skb->remcsum_offload = 1;
 
@@ -254,6 +250,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 	void *data;
 	u16 doffset = 0;
 	int flush = 1;
+	struct gro_remcsum grc;
+
+	skb_gro_remcsum_init(&grc);
 
 	off = skb_gro_offset(skb);
 	len = off + sizeof(*guehdr);
@@ -295,7 +294,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 		if (flags & GUE_PFLAG_REMCSUM) {
 			guehdr = gue_gro_remcsum(skb, off, guehdr,
 						 data + doffset, hdrlen,
-						 guehdr->proto_ctype);
+						 guehdr->proto_ctype, &grc);
 			if (!guehdr)
 				goto out;
 
@@ -345,6 +344,7 @@ out_unlock:
 	rcu_read_unlock();
 out:
 	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_remcsum_cleanup(skb, &grc);
 
 	return pp;
 }
-- 
cgit v1.2.3


From 6edec0e61b1e52f9144935d28c9088f5b202e61c Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 10 Feb 2015 16:30:28 -0800
Subject: net: Clarify meaning of CHECKSUM_PARTIAL for receive path

The current meaning of CHECKSUM_PARTIAL for validating checksums
is that _all_ checksums in the packet are considered valid.
However, in the manner that CHECKSUM_PARTIAL is set only the checksum
at csum_start+csum_offset and any preceding checksums may
be considered valid. If there are checksums in the packet after
csum_offset it is possible they have not been verfied.

This patch changes CHECKSUM_PARTIAL logic in skb_csum_unnecessary and
__skb_gro_checksum_validate_needed to only considered checksums
referring to csum_start and any preceding checksums (with starting
offset before csum_start) to be verified.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 +++-
 include/linux/skbuff.h    | 17 ++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3aa02457fe4f..9e9be221e4bf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2246,7 +2246,9 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
 						      bool zero_okay,
 						      __sum16 check)
 {
-	return (skb->ip_summed != CHECKSUM_PARTIAL &&
+	return ((skb->ip_summed != CHECKSUM_PARTIAL ||
+		skb_checksum_start_offset(skb) <
+		 skb_gro_offset(skb)) &&
 		NAPI_GRO_CB(skb)->csum_cnt == 0 &&
 		(!zero_okay || check));
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1bb36edb66b9..da6028a50687 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -83,11 +83,15 @@
  *
  * CHECKSUM_PARTIAL:
  *
- *   This is identical to the case for output below. This may occur on a packet
+ *   A checksum is set up to be offloaded to a device as described in the
+ *   output description for CHECKSUM_PARTIAL. This may occur on a packet
  *   received directly from another Linux OS, e.g., a virtualized Linux kernel
- *   on the same host. The packet can be treated in the same way as
- *   CHECKSUM_UNNECESSARY, except that on output (i.e., forwarding) the
- *   checksum must be filled in by the OS or the hardware.
+ *   on the same host, or it may be set in the input path in GRO or remote
+ *   checksum offload. For the purposes of checksum verification, the checksum
+ *   referred to by skb->csum_start + skb->csum_offset and any preceding
+ *   checksums in the packet are considered verified. Any checksums in the
+ *   packet that are after the checksum being offloaded are not considered to
+ *   be verified.
  *
  * B. Checksumming on output.
  *
@@ -2915,7 +2919,10 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb);
 
 static inline int skb_csum_unnecessary(const struct sk_buff *skb)
 {
-	return ((skb->ip_summed & CHECKSUM_UNNECESSARY) || skb->csum_valid);
+	return ((skb->ip_summed == CHECKSUM_UNNECESSARY) ||
+		skb->csum_valid ||
+		(skb->ip_summed == CHECKSUM_PARTIAL &&
+		 skb_checksum_start_offset(skb) >= 0));
 }
 
 /**
-- 
cgit v1.2.3


From baa32ff42871f2d4aca9c08c9403d0e497325564 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 10 Feb 2015 16:30:30 -0800
Subject: net: Use more bit fields in napi_gro_cb

This patch moves the free and same_flow fields to be bit fields
(2 and 1 bit sized respectively). This frees up some space for u16's.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9e9be221e4bf..43fd0a4dcd04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1923,20 +1923,15 @@ struct napi_gro_cb {
 	/* Number of segments aggregated. */
 	u16	count;
 
-	/* This is non-zero if the packet may be of the same flow. */
-	u8	same_flow;
-
-	/* Free the skb? */
-	u8	free;
-#define NAPI_GRO_FREE		  1
-#define NAPI_GRO_FREE_STOLEN_HEAD 2
-
 	/* jiffies when first packet was created/queued */
 	unsigned long age;
 
 	/* Used in ipv6_gro_receive() and foo-over-udp */
 	u16	proto;
 
+	/* This is non-zero if the packet may be of the same flow. */
+	u8	same_flow:1;
+
 	/* Used in udp_gro_receive */
 	u8	udp_mark:1;
 
@@ -1946,9 +1941,16 @@ struct napi_gro_cb {
 	/* Number of checksums via CHECKSUM_UNNECESSARY */
 	u8	csum_cnt:3;
 
+	/* Free the skb? */
+	u8	free:2;
+#define NAPI_GRO_FREE		  1
+#define NAPI_GRO_FREE_STOLEN_HEAD 2
+
 	/* Used in foo-over-udp, set in udp[46]_gro_receive */
 	u8	is_ipv6:1;
 
+	/* 7 bit hole */
+
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
 
-- 
cgit v1.2.3


From 15e2396d4e3ce23188852b74d924107982c63b42 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 10 Feb 2015 16:30:31 -0800
Subject: net: Infrastructure for CHECKSUM_PARTIAL with remote checsum offload

This patch adds infrastructure so that remote checksum offload can
set CHECKSUM_PARTIAL instead of calling csum_partial and writing
the modfied checksum field.

Add skb_remcsum_adjust_partial function to set an skb for using
CHECKSUM_PARTIAL with remote checksum offload.  Changed
skb_remcsum_process and skb_gro_remcsum_process to take a boolean
argument to indicate if checksum partial can be set or the
checksum needs to be modified using the normal algorithm.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       |  4 ++--
 include/linux/netdevice.h | 19 ++++++++++++++++++-
 include/linux/skbuff.h    | 15 ++++++++++++++-
 net/core/dev.c            |  1 +
 net/ipv4/fou.c            |  4 ++--
 5 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 30310a63475a..4f04443cfd33 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -580,7 +580,7 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 	}
 
 	skb_gro_remcsum_process(skb, (void *)vh + hdrlen,
-				start, offset, grc);
+				start, offset, grc, true);
 
 	skb->remcsum_offload = 1;
 
@@ -1171,7 +1171,7 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 
 	vh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
 
-	skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset);
+	skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset, true);
 
 	return vh;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 43fd0a4dcd04..5897b4ea5a3f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1923,6 +1923,9 @@ struct napi_gro_cb {
 	/* Number of segments aggregated. */
 	u16	count;
 
+	/* Start offset for remote checksum offload */
+	u16	gro_remcsum_start;
+
 	/* jiffies when first packet was created/queued */
 	unsigned long age;
 
@@ -2244,6 +2247,12 @@ static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
 
 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
 
+static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
+{
+	return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) ==
+		skb_gro_offset(skb));
+}
+
 static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
 						      bool zero_okay,
 						      __sum16 check)
@@ -2251,6 +2260,7 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
 	return ((skb->ip_summed != CHECKSUM_PARTIAL ||
 		skb_checksum_start_offset(skb) <
 		 skb_gro_offset(skb)) &&
+		!skb_at_gro_remcsum_start(skb) &&
 		NAPI_GRO_CB(skb)->csum_cnt == 0 &&
 		(!zero_okay || check));
 }
@@ -2337,12 +2347,19 @@ static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
 
 static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
 					   int start, int offset,
-					   struct gro_remcsum *grc)
+					   struct gro_remcsum *grc,
+					   bool nopartial)
 {
 	__wsum delta;
 
 	BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
 
+	if (!nopartial) {
+		NAPI_GRO_CB(skb)->gro_remcsum_start =
+		    ((unsigned char *)ptr + start) - skb->head;
+		return;
+	}
+
 	delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset);
 
 	/* Adjust skb->csum since we changed the packet */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index da6028a50687..30007afe70b3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3104,16 +3104,29 @@ do {									\
 				       compute_pseudo(skb, proto));	\
 } while (0)
 
+static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
+					      u16 start, u16 offset)
+{
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = ((unsigned char *)ptr + start) - skb->head;
+	skb->csum_offset = offset - start;
+}
+
 /* Update skbuf and packet to reflect the remote checksum offload operation.
  * When called, ptr indicates the starting point for skb->csum when
  * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete
  * here, skb_postpull_rcsum is done so skb->csum start is ptr.
  */
 static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
-				       int start, int offset)
+				       int start, int offset, bool nopartial)
 {
 	__wsum delta;
 
+	if (!nopartial) {
+		skb_remcsum_adjust_partial(skb, ptr, start, offset);
+		return;
+	}
+
 	 if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
 		__skb_checksum_complete(skb);
 		skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
diff --git a/net/core/dev.c b/net/core/dev.c
index d030575532a2..48c6ecb18f3c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4024,6 +4024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		NAPI_GRO_CB(skb)->flush = 0;
 		NAPI_GRO_CB(skb)->free = 0;
 		NAPI_GRO_CB(skb)->udp_mark = 0;
+		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
 		/* Setup for GRO checksum validation */
 		switch (skb->ip_summed) {
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 7fa8d36e56d4..d320f575cc62 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -75,7 +75,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
 		return NULL;
 	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
 
-	skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
+	skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, true);
 
 	return guehdr;
 }
@@ -230,7 +230,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 	}
 
 	skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
-				start, offset, grc);
+				start, offset, grc, true);
 
 	skb->remcsum_offload = 1;
 
-- 
cgit v1.2.3


From 119ee9144534141822462e3e8a5ccc8dc537f712 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 29 Jan 2015 11:45:33 -0800
Subject: f2fs: split UMOUNT and FASTBOOT flags

This patch adds FASTBOOT flag into checkpoint as follows.

 - CP_UMOUNT_FLAG is set when system is umounted.
 - CP_FASTBOOT_FLAG is set when intermediate checkpoint having node summaries
   was done.

So, if you get CP_UMOUNT_FLAG from checkpoint, the system was umounted cleanly.
Instead, if there was sudden-power-off, you can get CP_FASTBOOT_FLAG or nothing.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c        | 19 +++++++++++++------
 fs/f2fs/f2fs.h              | 23 +++++++++++++++++++++++
 fs/f2fs/gc.c                |  3 +--
 fs/f2fs/segment.c           | 11 +++++------
 fs/f2fs/super.c             |  5 ++---
 include/linux/f2fs_fs.h     |  1 +
 include/trace/events/f2fs.h |  1 +
 7 files changed, 46 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 22165fb1d0d1..f7cdcad31943 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -956,17 +956,24 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 			orphan_blocks);
 
-	if (cpc->reason == CP_UMOUNT) {
-		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	if (__remain_node_summaries(cpc->reason))
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks + NR_CURSEG_NODE_TYPE);
-	} else {
-		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks);
-	}
+
+	if (cpc->reason == CP_UMOUNT)
+		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+	if (cpc->reason == CP_FASTBOOT)
+		set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
 
 	if (orphan_num)
 		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
@@ -1010,7 +1017,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	write_data_summaries(sbi, start_blk);
 	start_blk += data_sum_blocks;
-	if (cpc->reason == CP_UMOUNT) {
+	if (__remain_node_summaries(cpc->reason)) {
 		write_node_summaries(sbi, start_blk);
 		start_blk += NR_CURSEG_NODE_TYPE;
 	}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5abe0836c179..8231a599a305 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -100,6 +100,7 @@ enum {
 
 enum {
 	CP_UMOUNT,
+	CP_FASTBOOT,
 	CP_SYNC,
 	CP_DISCARD,
 };
@@ -764,6 +765,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 	up_write(&sbi->cp_rwsem);
 }
 
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+	int reason = CP_SYNC;
+
+	if (test_opt(sbi, FASTBOOT))
+		reason = CP_FASTBOOT;
+	if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+		reason = CP_UMOUNT;
+	return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+	return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+	return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
 /*
  * Check whether the given nid is within node id range.
  */
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ba89e27f394f..76adbc3641f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -698,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 		.iroot = RADIX_TREE_INIT(GFP_NOFS),
 	};
 
-	cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
-
+	cpc.reason = __get_cp_reason(sbi);
 gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 31c4e5702c7d..5ea57ec153d1 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1401,7 +1401,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 		segno = le32_to_cpu(ckpt->cur_data_segno[type]);
 		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
 							CURSEG_HOT_DATA]);
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+		if (__exist_node_summaries(sbi))
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
 		else
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1410,7 +1410,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 							CURSEG_HOT_NODE]);
 		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
 							CURSEG_HOT_NODE]);
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+		if (__exist_node_summaries(sbi))
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
 							type - CURSEG_HOT_NODE);
 		else
@@ -1421,7 +1421,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 	sum = (struct f2fs_summary_block *)page_address(new);
 
 	if (IS_NODESEG(type)) {
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+		if (__exist_node_summaries(sbi)) {
 			struct f2fs_summary *ns = &sum->entries[0];
 			int i;
 			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1470,7 +1470,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
 		type = CURSEG_HOT_NODE;
 	}
 
-	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
+	if (__exist_node_summaries(sbi))
 		ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
 					NR_CURSEG_TYPE - type, META_CP);
 
@@ -1567,8 +1567,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 
 void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 {
-	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
-		write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
 }
 
 int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 06d903b5d6c8..bfeab3c81a48 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -500,9 +500,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 	if (sync) {
 		struct cp_control cpc;
 
-		cpc.reason = (test_opt(sbi, FASTBOOT) ||
-					is_sbi_flag_set(sbi, SBI_IS_CLOSE)) ?
-						CP_UMOUNT : CP_SYNC;
+		cpc.reason = __get_cp_reason(sbi);
+
 		mutex_lock(&sbi->gc_mutex);
 		write_checkpoint(sbi, &cpc);
 		mutex_unlock(&sbi->gc_mutex);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index e993b0bc9abf..09805720f4bf 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -87,6 +87,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_FASTBOOT_FLAG	0x00000020
 #define CP_FSCK_FLAG		0x00000010
 #define CP_ERROR_FLAG		0x00000008
 #define CP_COMPACT_SUM_FLAG	0x00000004
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 5e1c0292250c..69629826c2ba 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -72,6 +72,7 @@
 #define show_cpreason(type)						\
 	__print_symbolic(type,						\
 		{ CP_UMOUNT,	"Umount" },				\
+		{ CP_FASTBOOT,	"Fastboot" },				\
 		{ CP_SYNC,	"Sync" },				\
 		{ CP_DISCARD,	"Discard" })
 
-- 
cgit v1.2.3


From f7ef9b83b583640111039b30e13263b71c3a6ed5 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 9 Feb 2015 12:02:44 -0800
Subject: f2fs: introduce macros to convert bytes and blocks in f2fs

This patch adds two macros for transition between byte and block offsets.
Currently, f2fs only supports 4KB blocks, so use the default size for now.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c    | 7 +++----
 fs/f2fs/file.c          | 3 +--
 fs/f2fs/segment.c       | 8 ++++----
 include/linux/f2fs_fs.h | 4 ++++
 4 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 31a715b5fd5c..58d88df0d632 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -981,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* write out checkpoint buffer at block 0 */
 	cp_page = grab_meta_page(sbi, start_blk++);
 	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
 	set_page_dirty(cp_page);
 	f2fs_put_page(cp_page, 1);
 
 	for (i = 1; i < 1 + cp_payload_blks; i++) {
 		cp_page = grab_meta_page(sbi, start_blk++);
 		kaddr = page_address(cp_page);
-		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
-				(1 << sbi->log_blocksize));
+		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
 		set_page_dirty(cp_page);
 		f2fs_put_page(cp_page, 1);
 	}
@@ -1009,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* writeout checkpoint block */
 	cp_page = grab_meta_page(sbi, start_blk);
 	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
 	set_page_dirty(cp_page);
 	f2fs_put_page(cp_page, 1);
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7188a2ac64b5..f3b007540a48 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -491,8 +491,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
 
-	free_from = (pgoff_t)
-		((from + blocksize - 1) >> (sbi->log_blocksize));
+	free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
 
 	if (lock)
 		f2fs_lock_op(sbi);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 9f278d156d88..877a272a8146 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1048,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = {
 
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 {
-	__u64 start = range->start >> sbi->log_blocksize;
-	__u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+	__u64 start = F2FS_BYTES_TO_BLK(range->start);
+	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
 	unsigned int start_segno, end_segno;
 	struct cp_control cpc;
 
@@ -1066,7 +1066,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
 						GET_SEGNO(sbi, end);
 	cpc.reason = CP_DISCARD;
-	cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+	cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
 
 	/* do checkpoint to issue discard commands safely */
 	for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
@@ -1080,7 +1080,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 		mutex_unlock(&sbi->gc_mutex);
 	}
 out:
-	range->len = cpc.trimmed << sbi->log_blocksize;
+	range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
 	return 0;
 }
 
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 09805720f4bf..a23556c32703 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -19,12 +19,16 @@
 #define F2FS_MAX_LOG_SECTOR_SIZE	12	/* 12 bits for 4096 bytes */
 #define F2FS_LOG_SECTORS_PER_BLOCK	3	/* log number for sector/blk */
 #define F2FS_BLKSIZE			4096	/* support only 4KB block */
+#define F2FS_BLKSIZE_BITS		12	/* bits for F2FS_BLKSIZE */
 #define F2FS_MAX_EXTENSION		64	/* # of extension entries */
 #define F2FS_BLK_ALIGN(x)	(((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE)
 
 #define NULL_ADDR		((block_t)0)	/* used as block_t addresses */
 #define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
 
+#define F2FS_BYTES_TO_BLK(bytes)	((bytes) >> F2FS_BLKSIZE_BITS)
+#define F2FS_BLK_TO_BYTES(blk)		((blk) << F2FS_BLKSIZE_BITS)
+
 /* 0, 1(node nid), 2(meta nid) are reserved node id */
 #define F2FS_RESERVED_NODE_NUM		3
 
-- 
cgit v1.2.3


From 5d833062139d290adb8b62c093b654a01a353448 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:16 -0800
Subject: mm: numa: do not dereference pmd outside of the lock during NUMA
 hinting fault

Automatic NUMA balancing depends on being able to protect PTEs to trap a
fault and gather reference locality information.  Very broadly speaking
it would mark PTEs as not present and use another bit to distinguish
between NUMA hinting faults and other types of faults.  It was
universally loved by everybody and caused no problems whatsoever.  That
last sentence might be a lie.

This series is very heavily based on patches from Linus and Aneesh to
replace the existing PTE/PMD NUMA helper functions with normal change
protections.  I did alter and add parts of it but I consider them
relatively minor contributions.  At their suggestion, acked-bys are in
there but I've no problem converting them to Signed-off-by if requested.

AFAIK, this has received no testing on ppc64 and I'm depending on Aneesh
for that.  I tested trinity under kvm-tool and passed and ran a few
other basic tests.  At the time of writing, only the short-lived tests
have completed but testing of V2 indicated that long-term testing had no
surprises.  In most cases I'm leaving out detail as it's not that
interesting.

specjbb single JVM: There was negligible performance difference in the
	benchmark itself for short runs. However, system activity is
	higher and interrupts are much higher over time -- possibly TLB
	flushes. Migrations are also higher. Overall, this is more overhead
	but considering the problems faced with the old approach I think
	we just have to suck it up and find another way of reducing the
	overhead.

specjbb multi JVM: Negligible performance difference to the actual benchmark
	but like the single JVM case, the system overhead is noticeably
	higher.  Again, interrupts are a major factor.

autonumabench: This was all over the place and about all that can be
	reasonably concluded is that it's different but not necessarily
	better or worse.

autonumabench
                                     3.18.0-rc5            3.18.0-rc5
                                 mmotm-20141119         protnone-v3r3
User    NUMA01               32380.24 (  0.00%)    21642.92 ( 33.16%)
User    NUMA01_THEADLOCAL    22481.02 (  0.00%)    22283.22 (  0.88%)
User    NUMA02                3137.00 (  0.00%)     3116.54 (  0.65%)
User    NUMA02_SMT            1614.03 (  0.00%)     1543.53 (  4.37%)
System  NUMA01                 322.97 (  0.00%)     1465.89 (-353.88%)
System  NUMA01_THEADLOCAL       91.87 (  0.00%)       49.32 ( 46.32%)
System  NUMA02                  37.83 (  0.00%)       14.61 ( 61.38%)
System  NUMA02_SMT               7.36 (  0.00%)        7.45 ( -1.22%)
Elapsed NUMA01                 716.63 (  0.00%)      599.29 ( 16.37%)
Elapsed NUMA01_THEADLOCAL      553.98 (  0.00%)      539.94 (  2.53%)
Elapsed NUMA02                  83.85 (  0.00%)       83.04 (  0.97%)
Elapsed NUMA02_SMT              86.57 (  0.00%)       79.15 (  8.57%)
CPU     NUMA01                4563.00 (  0.00%)     3855.00 ( 15.52%)
CPU     NUMA01_THEADLOCAL     4074.00 (  0.00%)     4136.00 ( -1.52%)
CPU     NUMA02                3785.00 (  0.00%)     3770.00 (  0.40%)
CPU     NUMA02_SMT            1872.00 (  0.00%)     1959.00 ( -4.65%)

System CPU usage of NUMA01 is worse but it's an adverse workload on this
machine so I'm reluctant to conclude that it's a problem that matters.  On
the other workloads that are sensible on this machine, system CPU usage is
great.  Overall time to complete the benchmark is comparable

          3.18.0-rc5  3.18.0-rc5
        mmotm-20141119protnone-v3r3
User        59612.50    48586.44
System        460.22     1537.45
Elapsed      1442.20     1304.29

NUMA alloc hit                 5075182     5743353
NUMA alloc miss                      0           0
NUMA interleave hit                  0           0
NUMA alloc local               5075174     5743339
NUMA base PTE updates        637061448   443106883
NUMA huge PMD updates          1243434      864747
NUMA page range updates     1273699656   885857347
NUMA hint faults               1658116     1214277
NUMA hint local faults          959487      754113
NUMA hint local percent             57          62
NUMA pages migrated            5467056    61676398

The NUMA pages migrated look terrible but when I looked at a graph of the
activity over time I see that the massive spike in migration activity was
during NUMA01.  This correlates with high system CPU usage and could be
simply down to bad luck but any modifications that affect that workload
would be related to scan rates and migrations, not the protection
mechanism.  For all other workloads, migration activity was comparable.

Overall, headline performance figures are comparable but the overhead is
higher, mostly in interrupts.  To some extent, higher overhead from this
approach was anticipated but not to this degree.  It's going to be
necessary to reduce this again with a separate series in the future.  It's
still worth going ahead with this series though as it's likely to avoid
constant headaches with Xen and is probably easier to maintain.

This patch (of 10):

A transhuge NUMA hinting fault may find the page is migrating and should
wait until migration completes.  The check is race-prone because the pmd
is deferenced outside of the page lock and while the race is tiny, it'll
be larger if the PMD is cleared while marking PMDs for hinting fault.
This patch closes the race.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 4 ----
 mm/huge_memory.c        | 3 ++-
 mm/migrate.c            | 6 ------
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index fab9b32ace8e..78baed5f2952 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -67,7 +67,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 #ifdef CONFIG_NUMA_BALANCING
 extern bool pmd_trans_migrating(pmd_t pmd);
-extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd);
 extern int migrate_misplaced_page(struct page *page,
 				  struct vm_area_struct *vma, int node);
 extern bool migrate_ratelimited(int node);
@@ -76,9 +75,6 @@ static inline bool pmd_trans_migrating(pmd_t pmd)
 {
 	return false;
 }
-static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
-}
 static inline int migrate_misplaced_page(struct page *page,
 					 struct vm_area_struct *vma, int node)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb7be110cad3..c6921362c5fc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1272,8 +1272,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * check_same as the page may no longer be mapped.
 	 */
 	if (unlikely(pmd_trans_migrating(*pmdp))) {
+		page = pmd_page(*pmdp);
 		spin_unlock(ptl);
-		wait_migrate_huge_page(vma->anon_vma, pmdp);
+		wait_on_page_locked(page);
 		goto out;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index f98067e5d353..5e8f03a8de2a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
 	return PageLocked(page);
 }
 
-void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
-	struct page *page = pmd_page(*pmd);
-	wait_on_page_locked(page);
-}
-
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
-- 
cgit v1.2.3


From 4d9424669946532be754a6e116618dcb58430cb4 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:28 -0800
Subject: mm: convert p[te|md]_mknonnuma and remaining page table manipulations

With PROT_NONE, the traditional page table manipulation functions are
sufficient.

[andre.przywara@arm.com: fix compiler warning in pmdp_invalidate()]
[akpm@linux-foundation.org: fix build with STRICT_MM_TYPECHECKS]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/pgtable-3level.h |  5 ++++-
 include/linux/huge_mm.h               |  3 +--
 mm/huge_memory.c                      | 33 +++++++--------------------------
 mm/memory.c                           | 10 ++++++----
 mm/mempolicy.c                        |  2 +-
 mm/migrate.c                          |  2 +-
 mm/mprotect.c                         |  2 +-
 mm/pgtable-generic.c                  |  2 --
 8 files changed, 21 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 18dbc82f85e5..423a5ac09d3a 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -257,7 +257,10 @@ PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 #define mk_pmd(page,prot)	pfn_pmd(page_to_pfn(page),prot)
 
 /* represent a notpresent pmd by zero, this is used by pmdp_invalidate */
-#define pmd_mknotpresent(pmd)	(__pmd(0))
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+	return __pmd(0);
+}
 
 static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 {
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f05159..062bd252e994 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,8 +31,7 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-			unsigned long addr, pgprot_t newprot,
-			int prot_numa);
+			unsigned long addr, pgprot_t newprot);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 915941c45169..cb9b3e847dac 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1355,9 +1355,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	goto out;
 clear_pmdnuma:
 	BUG_ON(!PageLocked(page));
-	pmd = pmd_mknonnuma(pmd);
+	pmd = pmd_modify(pmd, vma->vm_page_prot);
 	set_pmd_at(mm, haddr, pmdp, pmd);
-	VM_BUG_ON(pmd_protnone(*pmdp));
 	update_mmu_cache_pmd(vma, addr, pmdp);
 	unlock_page(page);
 out_unlock:
@@ -1472,7 +1471,7 @@ out:
  *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, pgprot_t newprot, int prot_numa)
+		unsigned long addr, pgprot_t newprot)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
@@ -1481,29 +1480,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		pmd_t entry;
 		ret = 1;
-		if (!prot_numa) {
-			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
-			if (pmd_protnone(entry))
-				entry = pmd_mknonnuma(entry);
-			entry = pmd_modify(entry, newprot);
-			ret = HPAGE_PMD_NR;
-			set_pmd_at(mm, addr, pmd, entry);
-			BUG_ON(pmd_write(entry));
-		} else {
-			struct page *page = pmd_page(*pmd);
-
-			/*
-			 * Do not trap faults against the zero page. The
-			 * read-only data is likely to be read-cached on the
-			 * local CPU cache and it is less useful to know about
-			 * local vs remote hits on the zero page.
-			 */
-			if (!is_huge_zero_page(page) &&
-			    !pmd_protnone(*pmd)) {
-				pmdp_set_numa(mm, addr, pmd);
-				ret = HPAGE_PMD_NR;
-			}
-		}
+		entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+		entry = pmd_modify(entry, newprot);
+		ret = HPAGE_PMD_NR;
+		set_pmd_at(mm, addr, pmd, entry);
+		BUG_ON(pmd_write(entry));
 		spin_unlock(ptl);
 	}
 
diff --git a/mm/memory.c b/mm/memory.c
index 92e6a6299e86..d7921760cf79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3018,9 +3018,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	* validation through pte_unmap_same(). It's of NUMA type but
 	* the pfn may be screwed if the read is non atomic.
 	*
-	* ptep_modify_prot_start is not called as this is clearing
-	* the _PAGE_NUMA bit and it is not really expected that there
-	* would be concurrent hardware modifications to the PTE.
+	* We can safely just do a "set_pte_at()", because the old
+	* page table entry is not accessible, so there would be no
+	* concurrent hardware modifications to the PTE.
 	*/
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
@@ -3029,7 +3029,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	pte = pte_mknonnuma(pte);
+	/* Make it present again */
+	pte = pte_modify(pte, vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
 	set_pte_at(mm, addr, ptep, pte);
 	update_mmu_cache(vma, addr, ptep);
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1bd23803576..c75f4dcec808 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 {
 	int nr_updated;
 
-	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 5e8f03a8de2a..85e042686031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1847,7 +1847,7 @@ out_fail:
 out_dropref:
 	ptl = pmd_lock(mm, pmd);
 	if (pmd_same(*pmd, entry)) {
-		entry = pmd_mknonnuma(entry);
+		entry = pmd_modify(entry, vma->vm_page_prot);
 		set_pmd_at(mm, mmun_start, pmd, entry);
 		update_mmu_cache_pmd(vma, address, &entry);
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 44ffa698484d..76824d73380d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -142,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 				split_huge_page_pmd(vma, addr, pmd);
 			else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,
-						newprot, prot_numa);
+						newprot);
 
 				if (nr_ptes) {
 					if (nr_ptes == HPAGE_PMD_NR) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4b8ad760dde3..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
 	pmd_t entry = *pmdp;
-	if (pmd_protnone(entry))
-		entry = pmd_mknonnuma(entry);
 	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
-- 
cgit v1.2.3


From 21d9ee3eda7792c45880b2f11bff8e95c9a061fb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:32 -0800
Subject: mm: remove remaining references to NUMA hinting bits and helpers

This patch removes the NUMA PTE bits and associated helpers.  As a
side-effect it increases the maximum possible swap space on x86-64.

One potential source of problems is races between the marking of PTEs
PROT_NONE, NUMA hinting faults and migration.  It must be guaranteed that
a PTE being protected is not faulted in parallel, seen as a pte_none and
corrupting memory.  The base case is safe but transhuge has problems in
the past due to an different migration mechanism and a dependance on page
lock to serialise migrations and warrants a closer look.

task_work hinting update			parallel fault
------------------------			--------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
						__handle_mm_fault
						pmd_none
						  do_huge_pmd_anonymous_page
						  read? pmd_lock blocks until hinting complete, fail !pmd_none test
						  write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none
      pmd_modify
      set_pmd_at

task_work hinting update			parallel migration
------------------------			------------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
						__handle_mm_fault
						  do_huge_pmd_numa_page
						    migrate_misplaced_transhuge_page
						    pmd_lock waits for updates to complete, recheck pmd_same
      pmd_modify
      set_pmd_at

Both of those are safe and the case where a transhuge page is inserted
during a protection update is unchanged.  The case where two processes try
migrating at the same time is unchanged by this series so should still be
ok.  I could not find a case where we are accidentally depending on the
PTE not being cleared and flushed.  If one is missed, it'll manifest as
corruption problems that start triggering shortly after this series is
merged and only happen when NUMA balancing is enabled.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/pgtable.h    |  54 +-----------
 arch/powerpc/include/asm/pte-common.h |   5 --
 arch/powerpc/include/asm/pte-hash64.h |   6 --
 arch/x86/include/asm/pgtable.h        |  22 +----
 arch/x86/include/asm/pgtable_64.h     |   5 --
 arch/x86/include/asm/pgtable_types.h  |  41 +--------
 include/asm-generic/pgtable.h         | 155 ----------------------------------
 include/linux/swapops.h               |   2 +-
 8 files changed, 7 insertions(+), 283 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 1146006d3477..79fee2eb8d56 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -55,64 +55,12 @@ static inline int pmd_protnone(pmd_t pmd)
 {
 	return pte_protnone(pmd_pte(pmd));
 }
-
-static inline int pte_present(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_NUMA_MASK;
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t pte)
-{
-	return pte_val(pte) & (_PAGE_PRESENT);
-}
-
-#define ptep_set_numa ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep)
-{
-	if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
-		VM_BUG_ON(1);
-
-	pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
-	return;
-}
-
-#define pmdp_set_numa pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pmd_t *pmdp)
-{
-	if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
-		VM_BUG_ON(1);
-
-	pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
-	return;
-}
-
-/*
- * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
- * which was inherited from x86. For the purposes of powerpc pte_basic_t and
- * pmd_t are equivalent
- */
-#define pteval_t pte_basic_t
-#define pmdval_t pmd_t
-static inline pteval_t ptenuma_flags(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_NUMA_MASK;
-}
-
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
-{
-	return pmd_val(pmd) & _PAGE_NUMA_MASK;
-}
-
-# else
+#endif /* CONFIG_NUMA_BALANCING */
 
 static inline int pte_present(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_PRESENT;
 }
-#endif /* CONFIG_NUMA_BALANCING */
 
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 2aef9b7a0eb2..c5a755ef7011 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -104,11 +104,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
 			 _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \
 			 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)
 
-#ifdef CONFIG_NUMA_BALANCING
-/* Mask of bits that distinguish present and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT)
-#endif
-
 /*
  * We define 2 sets of base prot bits, one for basic pages (ie,
  * cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 2505d8eab15c..55aea0caf95e 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -27,12 +27,6 @@
 #define _PAGE_RW		0x0200 /* software: user write access allowed */
 #define _PAGE_BUSY		0x0800 /* software: PTE & hash are busy */
 
-/*
- * Used for tracking numa faults
- */
-#define _PAGE_NUMA	0x00000010 /* Gather numa placement stats */
-
-
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW		(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
 #define _PAGE_KERNEL_RO		 _PAGE_KERNEL_RW
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f519b0b529dd..34d42a7d5595 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+	return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -442,13 +442,6 @@ static inline int pte_same(pte_t a, pte_t b)
 }
 
 static inline int pte_present(pte_t a)
-{
-	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
-			       _PAGE_NUMA);
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t a)
 {
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
@@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 	if (pte_flags(a) & _PAGE_PRESENT)
 		return true;
 
-	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+	if ((pte_flags(a) & _PAGE_PROTNONE) &&
 			mm_tlb_flush_pending(mm))
 		return true;
 
@@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd)
 	 * the _PAGE_PSE flag will remain set at all times while the
 	 * _PAGE_PRESENT bit is clear).
 	 */
-	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
-				 _PAGE_NUMA);
+	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 
 static inline int pmd_bad(pmd_t pmd)
 {
-#ifdef CONFIG_NUMA_BALANCING
-	/* pmd_numa check */
-	if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
-		return 0;
-#endif
 	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
 }
 
@@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
 static inline int pte_swp_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
 }
 
 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 #endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e227970f983e..2ee781114d34 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 /* Encode and de-code a swap entry */
 #define SWP_TYPE_BITS 5
-#ifdef CONFIG_NUMA_BALANCING
-/* Automatic NUMA balancing needs to be distinguishable from swap entries */
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
-#else
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#endif
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3e0230c94cff..8c7c10802e9c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -27,14 +27,6 @@
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
-/*
- * Swap offsets on configurations that allow automatic NUMA balancing use the
- * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
- * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
- * maximum possible swap space from 16TB to 8TB.
- */
-#define _PAGE_BIT_NUMA		(_PAGE_BIT_GLOBAL+1)
-
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
@@ -75,21 +67,6 @@
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
-/*
- * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
- * that is not present. The hinting fault gathers numa placement statistics
- * (see pte_numa()). The bit is always zero when the PTE is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- */
-#ifdef CONFIG_NUMA_BALANCING
-#define _PAGE_NUMA	(_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
-#else
-#define _PAGE_NUMA	(_AT(pteval_t, 0))
-#endif
-
 /*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
@@ -122,8 +99,8 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
 			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
-			 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
+			 _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 /*
  * The cache modes defined here are used to translate between pure SW usage
@@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte)
 	return native_pte_val(pte) & PTE_FLAGS_MASK;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-/* Set of bits that distinguishes present, prot_none and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
-static inline pteval_t ptenuma_flags(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_NUMA_MASK;
-}
-
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
-{
-	return pmd_flags(pmd) & _PAGE_NUMA_MASK;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #define pgprot_val(x)	((x).pgprot)
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 067922c06c29..4d46085c1b90 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -244,10 +244,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 # define pte_accessible(mm, pte)	((void)(pte), 1)
 #endif
 
-#ifndef pte_present_nonuma
-#define pte_present_nonuma(pte) pte_present(pte)
-#endif
-
 #ifndef flush_tlb_fix_spurious_fault
 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
 #endif
@@ -693,157 +689,6 @@ static inline int pmd_protnone(pmd_t pmd)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that
- * is protected for PROT_NONE and a NUMA hinting fault entry. If the
- * architecture defines __PAGE_PROTNONE then it should take that into account
- * but those that do not can rely on the fact that the NUMA hinting scanner
- * skips inaccessible VMAs.
- *
- * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
- * fault triggers on those regions if pte/pmd_numa returns true
- * (because _PAGE_PRESENT is not set).
- */
-#ifndef pte_numa
-static inline int pte_numa(pte_t pte)
-{
-	return ptenuma_flags(pte) == _PAGE_NUMA;
-}
-#endif
-
-#ifndef pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
-	return pmdnuma_flags(pmd) == _PAGE_NUMA;
-}
-#endif
-
-/*
- * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
- * because they're called by the NUMA hinting minor page fault. If we
- * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
- * would be forced to set it later while filling the TLB after we
- * return to userland. That would trigger a second write to memory
- * that we optimize away by setting _PAGE_ACCESSED here.
- */
-#ifndef pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
-	pteval_t val = pte_val(pte);
-
-	val &= ~_PAGE_NUMA;
-	val |= (_PAGE_PRESENT|_PAGE_ACCESSED);
-	return __pte(val);
-}
-#endif
-
-#ifndef pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
-	pmdval_t val = pmd_val(pmd);
-
-	val &= ~_PAGE_NUMA;
-	val |= (_PAGE_PRESENT|_PAGE_ACCESSED);
-
-	return __pmd(val);
-}
-#endif
-
-#ifndef pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
-	pteval_t val = pte_val(pte);
-
-	VM_BUG_ON(!(val & _PAGE_PRESENT));
-
-	val &= ~_PAGE_PRESENT;
-	val |= _PAGE_NUMA;
-
-	return __pte(val);
-}
-#endif
-
-#ifndef ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep)
-{
-	pte_t ptent = *ptep;
-
-	ptent = pte_mknuma(ptent);
-	set_pte_at(mm, addr, ptep, ptent);
-	return;
-}
-#endif
-
-#ifndef pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
-	pmdval_t val = pmd_val(pmd);
-
-	val &= ~_PAGE_PRESENT;
-	val |= _PAGE_NUMA;
-
-	return __pmd(val);
-}
-#endif
-
-#ifndef pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pmd_t *pmdp)
-{
-	pmd_t pmd = *pmdp;
-
-	pmd = pmd_mknuma(pmd);
-	set_pmd_at(mm, addr, pmdp, pmd);
-	return;
-}
-#endif
-#else
-static inline int pmd_numa(pmd_t pmd)
-{
-	return 0;
-}
-
-static inline int pte_numa(pte_t pte)
-{
-	return 0;
-}
-
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
-	return pte;
-}
-
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
-	return pmd;
-}
-
-static inline pte_t pte_mknuma(pte_t pte)
-{
-	return pte;
-}
-
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep)
-{
-	return;
-}
-
-
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
-	return pmd;
-}
-
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pmd_t *pmdp)
-{
-	return ;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 831a3168ab35..cedf3d3c373f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 /* check whether a pte points to a swap entry */
 static inline int is_swap_pte(pte_t pte)
 {
-	return !pte_none(pte) && !pte_present_nonuma(pte);
+	return !pte_none(pte) && !pte_present(pte);
 }
 #endif
 
-- 
cgit v1.2.3


From e944fd67b625c02bda4a78ddf85e413c5e401474 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:35 -0800
Subject: mm: numa: do not trap faults on the huge zero page

Faults on the huge zero page are pointless and there is a BUG_ON to catch
them during fault time.  This patch reintroduces a check that avoids
marking the zero page PAGE_NONE.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h |  3 ++-
 mm/huge_memory.c        | 13 ++++++++++++-
 mm/memory.c             |  1 -
 mm/mprotect.c           | 14 +++++++++++++-
 4 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 062bd252e994..f10b20f05159 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-			unsigned long addr, pgprot_t newprot);
+			unsigned long addr, pgprot_t newprot,
+			int prot_numa);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb9b3e847dac..8e791a3db6b6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1471,7 +1471,7 @@ out:
  *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, pgprot_t newprot)
+		unsigned long addr, pgprot_t newprot, int prot_numa)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
@@ -1479,6 +1479,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		pmd_t entry;
+
+		/*
+		 * Avoid trapping faults against the zero page. The read-only
+		 * data is likely to be read-cached on the local CPU and
+		 * local/remote hits to the zero page are not interesting.
+		 */
+		if (prot_numa && is_huge_zero_pmd(*pmd)) {
+			spin_unlock(ptl);
+			return 0;
+		}
+
 		ret = 1;
 		entry = pmdp_get_and_clear_notify(mm, addr, pmd);
 		entry = pmd_modify(entry, newprot);
diff --git a/mm/memory.c b/mm/memory.c
index d7921760cf79..bf244f56b05a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3040,7 +3040,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte_unmap_unlock(ptep, ptl);
 		return 0;
 	}
-	BUG_ON(is_zero_pfn(page_to_pfn(page)));
 
 	/*
 	 * Avoid grouping on DSO/COW pages in specific and RO pages
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 76824d73380d..dd599fc235c2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -76,6 +76,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (pte_present(oldpte)) {
 			pte_t ptent;
 
+			/*
+			 * Avoid trapping faults against the zero or KSM
+			 * pages. See similar comment in change_huge_pmd.
+			 */
+			if (prot_numa) {
+				struct page *page;
+
+				page = vm_normal_page(vma, addr, oldpte);
+				if (!page || PageKsm(page))
+					continue;
+			}
+
 			ptent = ptep_modify_prot_start(mm, addr, pte);
 			ptent = pte_modify(ptent, newprot);
 
@@ -142,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 				split_huge_page_pmd(vma, addr, pmd);
 			else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,
-						newprot);
+						newprot, prot_numa);
 
 				if (nr_ptes) {
 					if (nr_ptes == HPAGE_PMD_NR) {
-- 
cgit v1.2.3


From 503c358cf1925853195ee39ec437e51138bbb7df Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:58:47 -0800
Subject: list_lru: introduce list_lru_shrink_{count,walk}

Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support.  That means when we hit the limit we will get ENOMEM w/o any
chance to recover.  What we should do then is to call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup.  This is what
this patch set is intended to do.

Basically, it does two things.  First, it introduces the notion of
per-memcg slab shrinker.  A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE.  Then it will be
passed the memory cgroup to scan from in shrink_control->memcg.  For
such shrinkers shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.

Secondly, this patch set makes the list_lru structure per-memcg.  It's
done transparently to list_lru users - everything they have to do is to
tell list_lru_init that they want memcg-aware list_lru.  Then the
list_lru will automatically distribute objects among per-memcg lists
basing on which cgroup the object is accounted to.  This way to make FS
shrinkers (icache, dcache) memcg-aware we only need to make them use
memcg-aware list_lru, and this is what this patch set does.

As before, this patch set only enables per-memcg kmem reclaim when the
pressure goes from memory.limit, not from memory.kmem.limit.  Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.

This patch (of 9):

NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists.  Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:

        count = list_lru_count_node(lru, sc->nid);
        freed = list_lru_walk_node(lru, sc->nid, isolate_func,
                                   isolate_arg, &sc->nr_to_scan);

where sc is an instance of the shrink_control structure passed to it
from vmscan.

To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.

This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c              | 14 ++++++--------
 fs/gfs2/quota.c          |  6 +++---
 fs/inode.c               |  7 +++----
 fs/internal.h            |  7 +++----
 fs/super.c               | 24 +++++++++++-------------
 fs/xfs/xfs_buf.c         |  7 +++----
 fs/xfs/xfs_qm.c          |  7 +++----
 include/linux/list_lru.h | 16 ++++++++++++++++
 mm/workingset.c          |  6 +++---
 9 files changed, 51 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..56c5da89f58a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -930,24 +930,22 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
 	long freed;
 
-	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
-				       &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
 	shrink_dentry_list(&dispose);
 	return freed;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e193cb36996..c15d6b216d0b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -171,8 +171,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 	if (!(sc->gfp_mask & __GFP_FS))
 		return SHRINK_STOP;
 
-	freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
-				   &dispose, &sc->nr_to_scan);
+	freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+				     gfs2_qd_isolate, &dispose);
 
 	gfs2_qd_dispose(&dispose);
 
@@ -182,7 +182,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
 					  struct shrink_control *sc)
 {
-	return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
+	return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
 }
 
 struct shrinker gfs2_qd_shrinker = {
diff --git a/fs/inode.c b/fs/inode.c
index 3a53b1da3fb8..524a32c2b0c6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -751,14 +751,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
  * to trim from the LRU. Inodes to be freed are moved to a temporary list and
  * then are freed outside inode_lock by dispose_list().
  */
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(freeable);
 	long freed;
 
-	freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
-				       &freeable, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+				     inode_lru_isolate, &freeable);
 	dispose_list(&freeable);
 	return freed;
 }
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..d92c346a793d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;
+struct shrink_control;
 
 /*
  * block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
-extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
-			    int nid);
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
 extern void inode_add_lru(struct inode *inode);
 
 /*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
-extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-			    int nid);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 
 /*
  * read_write.c
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..4554ac257647 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -77,8 +77,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	if (sb->s_op->nr_cached_objects)
 		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
 
-	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
-	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects = dentries + inodes + fs_objects + 1;
 	if (!total_objects)
 		total_objects = 1;
@@ -86,20 +86,20 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	/* proportion the scan between the caches */
 	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
 	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 
 	/*
 	 * prune the dcache first as the icache is pinned by it, then
 	 * prune the icache, followed by the filesystem specific caches
 	 */
-	freed = prune_dcache_sb(sb, dentries, sc->nid);
-	freed += prune_icache_sb(sb, inodes, sc->nid);
+	sc->nr_to_scan = dentries;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes;
+	freed += prune_icache_sb(sb, sc);
 
-	if (fs_objects) {
-		fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
-								total_objects);
+	if (fs_objects)
 		freed += sb->s_op->free_cached_objects(sb, fs_objects,
 						       sc->nid);
-	}
 
 	drop_super(sb);
 	return freed;
@@ -118,17 +118,15 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	 * scalability bottleneck. The counts could get updated
 	 * between super_cache_count and super_cache_scan anyway.
 	 * Call to super_cache_count with shrinker_rwsem held
-	 * ensures the safety of call to list_lru_count_node() and
+	 * ensures the safety of call to list_lru_shrink_count() and
 	 * s_op->nr_cached_objects().
 	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
 		total_objects = sb->s_op->nr_cached_objects(sb,
 						 sc->nid);
 
-	total_objects += list_lru_count_node(&sb->s_dentry_lru,
-						 sc->nid);
-	total_objects += list_lru_count_node(&sb->s_inode_lru,
-						 sc->nid);
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
 
 	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..15c9d224c721 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1583,10 +1583,9 @@ xfs_buftarg_shrink_scan(
 					struct xfs_buftarg, bt_shrinker);
 	LIST_HEAD(dispose);
 	unsigned long		freed;
-	unsigned long		nr_to_scan = sc->nr_to_scan;
 
-	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
-				       &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+				     xfs_buftarg_isolate, &dispose);
 
 	while (!list_empty(&dispose)) {
 		struct xfs_buf *bp;
@@ -1605,7 +1604,7 @@ xfs_buftarg_shrink_count(
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	return list_lru_count_node(&btp->bt_lru, sc->nid);
+	return list_lru_shrink_count(&btp->bt_lru, sc);
 }
 
 void
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e8186279541..4f4b1274e144 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -523,7 +523,6 @@ xfs_qm_shrink_scan(
 	struct xfs_qm_isolate	isol;
 	unsigned long		freed;
 	int			error;
-	unsigned long		nr_to_scan = sc->nr_to_scan;
 
 	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
 		return 0;
@@ -531,8 +530,8 @@ xfs_qm_shrink_scan(
 	INIT_LIST_HEAD(&isol.buffers);
 	INIT_LIST_HEAD(&isol.dispose);
 
-	freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
-					&nr_to_scan);
+	freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+				     xfs_qm_dquot_isolate, &isol);
 
 	error = xfs_buf_delwri_submit(&isol.buffers);
 	if (error)
@@ -557,7 +556,7 @@ xfs_qm_shrink_count(
 	struct xfs_quotainfo	*qi = container_of(shrink,
 					struct xfs_quotainfo, qi_shrinker);
 
-	return list_lru_count_node(&qi->qi_lru, sc->nid);
+	return list_lru_shrink_count(&qi->qi_lru, sc);
 }
 
 /*
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index f3434533fbf8..f500a2e39b13 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -9,6 +9,7 @@
 
 #include <linux/list.h>
 #include <linux/nodemask.h>
+#include <linux/shrinker.h>
 
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
@@ -81,6 +82,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
  * Callers that want such a guarantee need to provide an outer lock.
  */
 unsigned long list_lru_count_node(struct list_lru *lru, int nid);
+
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+						  struct shrink_control *sc)
+{
+	return list_lru_count_node(lru, sc->nid);
+}
+
 static inline unsigned long list_lru_count(struct list_lru *lru)
 {
 	long count = 0;
@@ -119,6 +127,14 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 list_lru_walk_cb isolate, void *cb_arg,
 				 unsigned long *nr_to_walk);
 
+static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+		     list_lru_walk_cb isolate, void *cb_arg)
+{
+	return list_lru_walk_node(lru, sc->nid, isolate, cb_arg,
+				  &sc->nr_to_scan);
+}
+
 static inline unsigned long
 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	      void *cb_arg, unsigned long nr_to_walk)
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..d4fa7fb10a52 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
 	local_irq_disable();
-	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+	shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
 	local_irq_enable();
 
 	pages = node_present_pages(sc->nid);
@@ -376,8 +376,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
 	local_irq_disable();
-	ret =  list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
-				  shadow_lru_isolate, NULL, &sc->nr_to_scan);
+	ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+				    shadow_lru_isolate, NULL);
 	local_irq_enable();
 	return ret;
 }
-- 
cgit v1.2.3


From 4101b624352fddb5ed72e7a1b6f8be8cffaa20fa Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:58:51 -0800
Subject: fs: consolidate {nr,free}_cached_objects args in shrink_control

We are going to make FS shrinkers memcg-aware.  To achieve that, we will
have to pass the memcg to scan to the nr_cached_objects and
free_cached_objects VFS methods, which currently take only the NUMA node
to scan.  Since the shrink_control structure already holds the node, and
the memcg to scan will be added to it when we introduce memcg-aware
vmscan, let us consolidate the methods' arguments in this structure to
keep things clean.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c         | 12 ++++++------
 fs/xfs/xfs_super.c |  7 +++----
 include/linux/fs.h |  6 ++++--
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 4554ac257647..a2b735a42e74 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -75,7 +75,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 		return SHRINK_STOP;
 
 	if (sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
 	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
 	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
@@ -97,9 +97,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	sc->nr_to_scan = inodes;
 	freed += prune_icache_sb(sb, sc);
 
-	if (fs_objects)
-		freed += sb->s_op->free_cached_objects(sb, fs_objects,
-						       sc->nid);
+	if (fs_objects) {
+		sc->nr_to_scan = fs_objects;
+		freed += sb->s_op->free_cached_objects(sb, sc);
+	}
 
 	drop_super(sb);
 	return freed;
@@ -122,8 +123,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	 * s_op->nr_cached_objects().
 	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
-		total_objects = sb->s_op->nr_cached_objects(sb,
-						 sc->nid);
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
 
 	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f2449fd86926..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1537,7 +1537,7 @@ xfs_fs_mount(
 static long
 xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
-	int			nid)
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
@@ -1545,10 +1545,9 @@ xfs_fs_nr_cached_objects(
 static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	long			nr_to_scan,
-	int			nid)
+	struct shrink_control	*sc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cdcb1e9d9613..a20d6586e517 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1635,8 +1635,10 @@ struct super_operations {
 	struct dquot **(*get_dquots)(struct inode *);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
-	long (*nr_cached_objects)(struct super_block *, int);
-	long (*free_cached_objects)(struct super_block *, long, int);
+	long (*nr_cached_objects)(struct super_block *,
+				  struct shrink_control *);
+	long (*free_cached_objects)(struct super_block *,
+				    struct shrink_control *);
 };
 
 /*
-- 
cgit v1.2.3


From cb731d6c62bbc2f890b08ea3d0386d5dad887326 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:58:54 -0800
Subject: vmscan: per memory cgroup slab shrinkers

This patch adds SHRINKER_MEMCG_AWARE flag.  If a shrinker has this flag
set, it will be called per memory cgroup.  The memory cgroup to scan
objects from is passed in shrink_control->memcg.  If the memory cgroup
is NULL, a memcg aware shrinker is supposed to scan objects from the
global list.  Unaware shrinkers are only called on global pressure with
memcg=NULL.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/drop_caches.c           | 14 --------
 include/linux/memcontrol.h |  7 ++++
 include/linux/mm.h         |  5 ++-
 include/linux/shrinker.h   |  6 +++-
 mm/memcontrol.c            |  2 +-
 mm/memory-failure.c        | 11 ++----
 mm/vmscan.c                | 85 ++++++++++++++++++++++++++++++++++------------
 7 files changed, 80 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-static void drop_slab(void)
-{
-	int nr_objects;
-
-	do {
-		int nid;
-
-		nr_objects = 0;
-		for_each_online_node(nid)
-			nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
-							1000, 1000);
-	} while (nr_objects > 10);
-}
-
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6cfd934c7c9b..54992fe0959f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -413,6 +413,8 @@ static inline bool memcg_kmem_enabled(void)
 	return static_key_false(&memcg_kmem_enabled_key);
 }
 
+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+
 /*
  * In general, we'll do everything in our power to not incur in any overhead
  * for non-memcg users for the kmem functions. Not even a function call, if we
@@ -542,6 +544,11 @@ static inline bool memcg_kmem_enabled(void)
 	return false;
 }
 
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+	return false;
+}
+
 static inline bool
 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a4d24f3c5430..af4ff88a11e0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2168,9 +2168,8 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 #endif
 
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
-				unsigned long nr_scanned,
-				unsigned long nr_eligible);
+void drop_slab(void);
+void drop_slab_node(int nid);
 
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index f4aee75f00b1..4fcacd915d45 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -20,6 +20,9 @@ struct shrink_control {
 
 	/* current node being shrunk (for NUMA aware shrinkers) */
 	int nid;
+
+	/* current memcg being shrunk (for memcg aware shrinkers) */
+	struct mem_cgroup *memcg;
 };
 
 #define SHRINK_STOP (~0UL)
@@ -61,7 +64,8 @@ struct shrinker {
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
 /* Flags */
-#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_NUMA_AWARE	(1 << 0)
+#define SHRINKER_MEMCG_AWARE	(1 << 1)
 
 extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 095c1f96fbec..3c2a1a8286ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -352,7 +352,7 @@ struct mem_cgroup {
 };
 
 #ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
 	return memcg->kmemcg_id >= 0;
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..1a735fad2a13 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
 	 * Only call shrink_node_slabs here (which would also shrink
 	 * other caches) if access is not potentially fatal.
 	 */
-	if (access) {
-		int nr;
-		int nid = page_to_nid(p);
-		do {
-			nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
-			if (page_count(p) == 1)
-				break;
-		} while (nr > 10);
-	}
+	if (access)
+		drop_slab_node(page_to_nid(p));
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e645ee52045..803886b8e353 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
 
 #define SHRINK_BATCH 128
 
-static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
-				  struct shrinker *shrinker,
-				  unsigned long nr_scanned,
-				  unsigned long nr_eligible)
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+				    struct shrinker *shrinker,
+				    unsigned long nr_scanned,
+				    unsigned long nr_eligible)
 {
 	unsigned long freed = 0;
 	unsigned long long delta;
@@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
 }
 
 /**
- * shrink_node_slabs - shrink slab caches of a given node
+ * shrink_slab - shrink slab caches
  * @gfp_mask: allocation context
  * @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
  * @nr_scanned: pressure numerator
  * @nr_eligible: pressure denominator
  *
@@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
  * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
  * unaware shrinkers will receive a node id of 0 instead.
  *
+ * @memcg specifies the memory cgroup to target. If it is not NULL,
+ * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
+ * objects from the memory cgroup specified. Otherwise all shrinkers
+ * are called, and memcg aware shrinkers are supposed to scan the
+ * global list then.
+ *
  * @nr_scanned and @nr_eligible form a ratio that indicate how much of
  * the available objects should be scanned.  Page reclaim for example
  * passes the number of pages scanned and the number of pages on the
@@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
  *
  * Returns the number of reclaimed slab objects.
  */
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
-				unsigned long nr_scanned,
-				unsigned long nr_eligible)
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+				 struct mem_cgroup *memcg,
+				 unsigned long nr_scanned,
+				 unsigned long nr_eligible)
 {
 	struct shrinker *shrinker;
 	unsigned long freed = 0;
 
+	if (memcg && !memcg_kmem_is_active(memcg))
+		return 0;
+
 	if (nr_scanned == 0)
 		nr_scanned = SWAP_CLUSTER_MAX;
 
@@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
+			.memcg = memcg,
 		};
 
+		if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+			continue;
+
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
-		freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
+		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
 	}
 
 	up_read(&shrinker_rwsem);
@@ -404,6 +419,29 @@ out:
 	return freed;
 }
 
+void drop_slab_node(int nid)
+{
+	unsigned long freed;
+
+	do {
+		struct mem_cgroup *memcg = NULL;
+
+		freed = 0;
+		do {
+			freed += shrink_slab(GFP_KERNEL, nid, memcg,
+					     1000, 1000);
+		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+	} while (freed > 10);
+}
+
+void drop_slab(void)
+{
+	int nid;
+
+	for_each_online_node(nid)
+		drop_slab_node(nid);
+}
+
 static inline int is_page_cache_freeable(struct page *page)
 {
 	/*
@@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
 static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			bool is_classzone)
 {
+	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
 	bool reclaimable = false;
 
@@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
 			unsigned long lru_pages;
+			unsigned long scanned;
 			struct lruvec *lruvec;
 			int swappiness;
 
@@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 			swappiness = mem_cgroup_swappiness(memcg);
+			scanned = sc->nr_scanned;
 
 			shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
 			zone_lru_pages += lru_pages;
 
+			if (memcg && is_classzone)
+				shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+					    memcg, sc->nr_scanned - scanned,
+					    lru_pages);
+
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
 			 * cgroups to fulfill the overall scan target for the
@@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		 * Shrink the slab caches in the same proportion that
 		 * the eligible LRU pages were scanned.
 		 */
-		if (global_reclaim(sc) && is_classzone) {
-			struct reclaim_state *reclaim_state;
-
-			shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
-					  sc->nr_scanned - nr_scanned,
-					  zone_lru_pages);
-
-			reclaim_state = current->reclaim_state;
-			if (reclaim_state) {
-				sc->nr_reclaimed +=
-					reclaim_state->reclaimed_slab;
-				reclaim_state->reclaimed_slab = 0;
-			}
+		if (global_reclaim(sc) && is_classzone)
+			shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+				    sc->nr_scanned - nr_scanned,
+				    zone_lru_pages);
+
+		if (reclaim_state) {
+			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
 		}
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
-- 
cgit v1.2.3


From dbcf73e26cd0b3d66e6db65ab595e664a55e58ff Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:58:57 -0800
Subject: memcg: rename some cache id related variables

memcg_limited_groups_array_size, which defines the size of memcg_caches
arrays, sounds rather cumbersome.  Also it doesn't point anyhow that
it's related to kmem/caches stuff.  So let's rename it to
memcg_nr_cache_ids.  It's concise and points us directly to
memcg_cache_id.

Also, rename kmem_limited_groups to memcg_cache_ida.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  4 ++--
 mm/memcontrol.c            | 19 +++++++++----------
 mm/slab_common.c           |  4 ++--
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 54992fe0959f..2607c91230af 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -398,7 +398,7 @@ static inline void sock_release_memcg(struct sock *sk)
 #ifdef CONFIG_MEMCG_KMEM
 extern struct static_key memcg_kmem_enabled_key;
 
-extern int memcg_limited_groups_array_size;
+extern int memcg_nr_cache_ids;
 
 /*
  * Helper macro to loop through all memcg-specific caches. Callers must still
@@ -406,7 +406,7 @@ extern int memcg_limited_groups_array_size;
  * the slab_mutex must be held when looping through those caches
  */
 #define for_each_memcg_cache_index(_idx)	\
-	for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
+	for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
 
 static inline bool memcg_kmem_enabled(void)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3c2a1a8286ac..8608fa543b84 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -538,12 +538,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
  *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
  *  200 entry array for that.
  *
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size.  It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
  */
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
 
 /*
  * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -2538,12 +2537,12 @@ static int memcg_alloc_cache_id(void)
 	int id, size;
 	int err;
 
-	id = ida_simple_get(&kmem_limited_groups,
+	id = ida_simple_get(&memcg_cache_ida,
 			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
 	if (id < 0)
 		return id;
 
-	if (id < memcg_limited_groups_array_size)
+	if (id < memcg_nr_cache_ids)
 		return id;
 
 	/*
@@ -2559,7 +2558,7 @@ static int memcg_alloc_cache_id(void)
 
 	err = memcg_update_all_caches(size);
 	if (err) {
-		ida_simple_remove(&kmem_limited_groups, id);
+		ida_simple_remove(&memcg_cache_ida, id);
 		return err;
 	}
 	return id;
@@ -2567,7 +2566,7 @@ static int memcg_alloc_cache_id(void)
 
 static void memcg_free_cache_id(int id)
 {
-	ida_simple_remove(&kmem_limited_groups, id);
+	ida_simple_remove(&memcg_cache_ida, id);
 }
 
 /*
@@ -2577,7 +2576,7 @@ static void memcg_free_cache_id(int id)
  */
 void memcg_update_array_size(int num)
 {
-	memcg_limited_groups_array_size = num;
+	memcg_nr_cache_ids = num;
 }
 
 struct memcg_kmem_cache_create_work {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6e1e4cf65836..f8899eedab68 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -116,7 +116,7 @@ static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
 
 	if (!memcg) {
 		size = offsetof(struct memcg_cache_params, memcg_caches);
-		size += memcg_limited_groups_array_size * sizeof(void *);
+		size += memcg_nr_cache_ids * sizeof(void *);
 	} else
 		size = sizeof(struct memcg_cache_params);
 
@@ -154,7 +154,7 @@ static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
 
 	cur_params = s->memcg_params;
 	memcpy(new_params->memcg_caches, cur_params->memcg_caches,
-	       memcg_limited_groups_array_size * sizeof(void *));
+	       memcg_nr_cache_ids * sizeof(void *));
 
 	new_params->is_root_cache = true;
 
-- 
cgit v1.2.3


From 05257a1a3dcc196c197714b5c9a8dd35b7f6aefc Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:01 -0800
Subject: memcg: add rwsem to synchronize against memcg_caches arrays
 relocation

We need a stable value of memcg_nr_cache_ids in kmem_cache_create()
(memcg_alloc_cache_params() wants it for root caches), where we only
hold the slab_mutex and no memcg-related locks.  As a result, we have to
update memcg_nr_cache_ids under the slab_mutex, which we can only take
on the slab's side (see memcg_update_array_size).  This looks awkward
and will become even worse when per-memcg list_lru is introduced, which
also wants stable access to memcg_nr_cache_ids.

To get rid of this dependency between the memcg_nr_cache_ids and the
slab_mutex, this patch introduces a special rwsem.  The rwsem is held
for writing during memcg_caches arrays relocation and memcg_nr_cache_ids
updates.  Therefore one can take it for reading to get a stable access
to memcg_caches arrays and/or memcg_nr_cache_ids.

Currently the semaphore is taken for reading only from
kmem_cache_create, right before taking the slab_mutex, so right now
there's no much point in using rwsem instead of mutex.  However, once
list_lru is made per-memcg it will allow list_lru initializations to
proceed concurrently.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 12 ++++++++++--
 mm/memcontrol.c            | 29 +++++++++++++++++++----------
 mm/slab_common.c           |  9 ++++-----
 3 files changed, 33 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2607c91230af..dbc4baa3619c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -399,6 +399,8 @@ static inline void sock_release_memcg(struct sock *sk)
 extern struct static_key memcg_kmem_enabled_key;
 
 extern int memcg_nr_cache_ids;
+extern void memcg_get_cache_ids(void);
+extern void memcg_put_cache_ids(void);
 
 /*
  * Helper macro to loop through all memcg-specific caches. Callers must still
@@ -434,8 +436,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
 
-void memcg_update_array_size(int num_groups);
-
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
 
@@ -569,6 +569,14 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 	return -1;
 }
 
+static inline void memcg_get_cache_ids(void)
+{
+}
+
+static inline void memcg_put_cache_ids(void)
+{
+}
+
 static inline struct kmem_cache *
 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8608fa543b84..6706e5fa5ac0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -544,6 +544,19 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 static DEFINE_IDA(memcg_cache_ida);
 int memcg_nr_cache_ids;
 
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+	down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+	up_read(&memcg_cache_ids_sem);
+}
+
 /*
  * MIN_SIZE is different than 1, because we would like to avoid going through
  * the alloc/free process all the time. In a small machine, 4 kmem-limited
@@ -2549,6 +2562,7 @@ static int memcg_alloc_cache_id(void)
 	 * There's no space for the new id in memcg_caches arrays,
 	 * so we have to grow them.
 	 */
+	down_write(&memcg_cache_ids_sem);
 
 	size = 2 * (id + 1);
 	if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2557,6 +2571,11 @@ static int memcg_alloc_cache_id(void)
 		size = MEMCG_CACHES_MAX_SIZE;
 
 	err = memcg_update_all_caches(size);
+	if (!err)
+		memcg_nr_cache_ids = size;
+
+	up_write(&memcg_cache_ids_sem);
+
 	if (err) {
 		ida_simple_remove(&memcg_cache_ida, id);
 		return err;
@@ -2569,16 +2588,6 @@ static void memcg_free_cache_id(int id)
 	ida_simple_remove(&memcg_cache_ida, id);
 }
 
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
-	memcg_nr_cache_ids = num;
-}
-
 struct memcg_kmem_cache_create_work {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *cachep;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f8899eedab68..23f5fcde6043 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -169,8 +169,8 @@ int memcg_update_all_caches(int num_memcgs)
 {
 	struct kmem_cache *s;
 	int ret = 0;
-	mutex_lock(&slab_mutex);
 
+	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
 		if (!is_root_cache(s))
 			continue;
@@ -181,11 +181,8 @@ int memcg_update_all_caches(int num_memcgs)
 		 * up to this point in an updated state.
 		 */
 		if (ret)
-			goto out;
+			break;
 	}
-
-	memcg_update_array_size(num_memcgs);
-out:
 	mutex_unlock(&slab_mutex);
 	return ret;
 }
@@ -369,6 +366,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
 
 	get_online_cpus();
 	get_online_mems();
+	memcg_get_cache_ids();
 
 	mutex_lock(&slab_mutex);
 
@@ -407,6 +405,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
 out_unlock:
 	mutex_unlock(&slab_mutex);
 
+	memcg_put_cache_ids();
 	put_online_mems();
 	put_online_cpus();
 
-- 
cgit v1.2.3


From ff0b67ef5b1687692bc1fd3ce4bc3d1ff83587c7 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:04 -0800
Subject: list_lru: get rid of ->active_nodes

The active_nodes mask allows us to skip empty nodes when walking over
list_lru items from all nodes in list_lru_count/walk.  However, these
functions are never called from hot paths, so it doesn't seem we need
such kind of optimization there.  OTOH, removing the mask will make it
easier to make list_lru per-memcg.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list_lru.h |  5 ++---
 mm/list_lru.c            | 10 +++-------
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index f500a2e39b13..53c1d6b78270 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -31,7 +31,6 @@ struct list_lru_node {
 
 struct list_lru {
 	struct list_lru_node	*node;
-	nodemask_t		active_nodes;
 };
 
 void list_lru_destroy(struct list_lru *lru);
@@ -94,7 +93,7 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
 	long count = 0;
 	int nid;
 
-	for_each_node_mask(nid, lru->active_nodes)
+	for_each_node_state(nid, N_NORMAL_MEMORY)
 		count += list_lru_count_node(lru, nid);
 
 	return count;
@@ -142,7 +141,7 @@ list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	long isolated = 0;
 	int nid;
 
-	for_each_node_mask(nid, lru->active_nodes) {
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
 		isolated += list_lru_walk_node(lru, nid, isolate,
 					       cb_arg, &nr_to_walk);
 		if (nr_to_walk <= 0)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db194173..07e198c77888 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -19,8 +19,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
 	WARN_ON_ONCE(nlru->nr_items < 0);
 	if (list_empty(item)) {
 		list_add_tail(item, &nlru->list);
-		if (nlru->nr_items++ == 0)
-			node_set(nid, lru->active_nodes);
+		nlru->nr_items++;
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -37,8 +36,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 	spin_lock(&nlru->lock);
 	if (!list_empty(item)) {
 		list_del_init(item);
-		if (--nlru->nr_items == 0)
-			node_clear(nid, lru->active_nodes);
+		nlru->nr_items--;
 		WARN_ON_ONCE(nlru->nr_items < 0);
 		spin_unlock(&nlru->lock);
 		return true;
@@ -90,8 +88,7 @@ restart:
 		case LRU_REMOVED_RETRY:
 			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
-			if (--nlru->nr_items == 0)
-				node_clear(nid, lru->active_nodes);
+			nlru->nr_items--;
 			WARN_ON_ONCE(nlru->nr_items < 0);
 			isolated++;
 			/*
@@ -133,7 +130,6 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
 	if (!lru->node)
 		return -ENOMEM;
 
-	nodes_clear(lru->active_nodes);
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
 		if (key)
-- 
cgit v1.2.3


From c0a5b560938a0f2fd2fbf66ddc446c7c2b41383a Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:07 -0800
Subject: list_lru: organize all list_lrus to list

To make list_lru memcg aware, we need all list_lrus to be kept on a list
protected by a mutex, so that we could sleep while walking over the
list.

Therefore after this change list_lru_destroy may sleep.  Fortunately,
there is only one user that calls it from an atomic context - it's
put_super - and we can easily fix it by calling list_lru_destroy before
put_super in destroy_locked_super - anyway we don't longer need lrus by
that time.

Another point that should be noted is that list_lru_destroy is allowed
to be called on an uninitialized zeroed-out object, in which case it is
a no-op.  Before this patch this was guaranteed by kfree, but now we
need an explicit check there.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c               |  8 ++++++++
 include/linux/list_lru.h |  3 +++
 mm/list_lru.c            | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index a2b735a42e74..b027849d92d2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -282,6 +282,14 @@ void deactivate_locked_super(struct super_block *s)
 		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
 
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 53c1d6b78270..ee9486ac0621 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -31,6 +31,9 @@ struct list_lru_node {
 
 struct list_lru {
 	struct list_lru_node	*node;
+#ifdef CONFIG_MEMCG_KMEM
+	struct list_head	list;
+#endif
 };
 
 void list_lru_destroy(struct list_lru *lru);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 07e198c77888..a9021cb3ccde 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,6 +9,34 @@
 #include <linux/mm.h>
 #include <linux/list_lru.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_MEMCG_KMEM
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
+static void list_lru_register(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_add(&lru->list, &list_lrus);
+	mutex_unlock(&list_lrus_mutex);
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_del(&lru->list);
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 bool list_lru_add(struct list_lru *lru, struct list_head *item)
 {
@@ -137,12 +165,18 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
 		INIT_LIST_HEAD(&lru->node[i].list);
 		lru->node[i].nr_items = 0;
 	}
+	list_lru_register(lru);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(list_lru_init_key);
 
 void list_lru_destroy(struct list_lru *lru)
 {
+	/* Already destroyed or not yet initialized? */
+	if (!lru->node)
+		return;
+	list_lru_unregister(lru);
 	kfree(lru->node);
+	lru->node = NULL;
 }
 EXPORT_SYMBOL_GPL(list_lru_destroy);
-- 
cgit v1.2.3


From 60d3fd32a7a9da4c8c93a9f89cfda22a0b4c65ce Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:10 -0800
Subject: list_lru: introduce per-memcg lists

There are several FS shrinkers, including super_block::s_shrink, that
keep reclaimable objects in the list_lru structure.  Hence to turn them
to memcg-aware shrinkers, it is enough to make list_lru per-memcg.

This patch does the trick.  It adds an array of lru lists to the
list_lru_node structure (per-node part of the list_lru), one for each
kmem-active memcg, and dispatches every item addition or removal to the
list corresponding to the memcg which the item is accounted to.  So now
the list_lru structure is not just per node, but per node and per memcg.

Not all list_lrus need this feature, so this patch also adds a new
method, list_lru_init_memcg, which initializes a list_lru as memcg
aware.  Otherwise (i.e.  if initialized with old list_lru_init), the
list_lru won't have per memcg lists.

Just like per memcg caches arrays, the arrays of per-memcg lists are
indexed by memcg_cache_id, so we must grow them whenever
memcg_nr_cache_ids is increased.  So we introduce a callback,
memcg_update_all_list_lrus, invoked by memcg_alloc_cache_id if the id
space is full.

The locking is implemented in a manner similar to lruvecs, i.e.  we have
one lock per node that protects all lists (both global and per cgroup) on
the node.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list_lru.h   |  52 +++++--
 include/linux/memcontrol.h |  14 ++
 mm/list_lru.c              | 374 ++++++++++++++++++++++++++++++++++++++++++---
 mm/memcontrol.c            |  20 +++
 4 files changed, 424 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index ee9486ac0621..305b598abac2 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -11,6 +11,8 @@
 #include <linux/nodemask.h>
 #include <linux/shrinker.h>
 
+struct mem_cgroup;
+
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
 	LRU_REMOVED,		/* item removed from list */
@@ -22,11 +24,26 @@ enum lru_status {
 				   internally, but has to return locked. */
 };
 
-struct list_lru_node {
-	spinlock_t		lock;
+struct list_lru_one {
 	struct list_head	list;
 	/* kept as signed so we can catch imbalance bugs */
 	long			nr_items;
+};
+
+struct list_lru_memcg {
+	/* array of per cgroup lists, indexed by memcg_cache_id */
+	struct list_lru_one	*lru[0];
+};
+
+struct list_lru_node {
+	/* protects all lists on the node, including per cgroup */
+	spinlock_t		lock;
+	/* global list, used for the root cgroup in cgroup aware lrus */
+	struct list_lru_one	lru;
+#ifdef CONFIG_MEMCG_KMEM
+	/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
+	struct list_lru_memcg	*memcg_lrus;
+#endif
 } ____cacheline_aligned_in_smp;
 
 struct list_lru {
@@ -37,11 +54,14 @@ struct list_lru {
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
-static inline int list_lru_init(struct list_lru *lru)
-{
-	return list_lru_init_key(lru, NULL);
-}
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+		    struct lock_class_key *key);
+
+#define list_lru_init(lru)		__list_lru_init((lru), false, NULL)
+#define list_lru_init_key(lru, key)	__list_lru_init((lru), false, (key))
+#define list_lru_init_memcg(lru)	__list_lru_init((lru), true, NULL)
+
+int memcg_update_all_list_lrus(int num_memcgs);
 
 /**
  * list_lru_add: add an element to the lru list's tail
@@ -75,20 +95,23 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
 /**
- * list_lru_count_node: return the number of objects currently held by @lru
+ * list_lru_count_one: return the number of objects currently held by @lru
  * @lru: the lru pointer.
  * @nid: the node id to count from.
+ * @memcg: the cgroup to count from.
  *
  * Always return a non-negative number, 0 for empty lists. There is no
  * guarantee that the list is not updated while the count is being computed.
  * Callers that want such a guarantee need to provide an outer lock.
  */
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg);
 unsigned long list_lru_count_node(struct list_lru *lru, int nid);
 
 static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
 						  struct shrink_control *sc)
 {
-	return list_lru_count_node(lru, sc->nid);
+	return list_lru_count_one(lru, sc->nid, sc->memcg);
 }
 
 static inline unsigned long list_lru_count(struct list_lru *lru)
@@ -105,9 +128,10 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
 typedef enum lru_status
 (*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
 /**
- * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
  * @isolate: callback function that is resposible for deciding what to do with
  *  the item currently being scanned
  * @cb_arg: opaque type that will be passed to @isolate
@@ -125,6 +149,10 @@ typedef enum lru_status
  *
  * Return value: the number of objects effectively removed from the LRU.
  */
+unsigned long list_lru_walk_one(struct list_lru *lru,
+				int nid, struct mem_cgroup *memcg,
+				list_lru_walk_cb isolate, void *cb_arg,
+				unsigned long *nr_to_walk);
 unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 list_lru_walk_cb isolate, void *cb_arg,
 				 unsigned long *nr_to_walk);
@@ -133,8 +161,8 @@ static inline unsigned long
 list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
 		     list_lru_walk_cb isolate, void *cb_arg)
 {
-	return list_lru_walk_node(lru, sc->nid, isolate, cb_arg,
-				  &sc->nr_to_scan);
+	return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
+				 &sc->nr_to_scan);
 }
 
 static inline unsigned long
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dbc4baa3619c..72dff5fb0d0c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -439,6 +439,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
 
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
+
 int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 		      unsigned long nr_pages);
 void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
@@ -535,6 +537,13 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_enabled())
 		__memcg_kmem_put_cache(cachep);
 }
+
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+	if (!memcg_kmem_enabled())
+		return NULL;
+	return __mem_cgroup_from_kmem(ptr);
+}
 #else
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
@@ -586,6 +595,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 }
+
+static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
 
diff --git a/mm/list_lru.c b/mm/list_lru.c
index a9021cb3ccde..79aee70c3b9d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -10,6 +10,7 @@
 #include <linux/list_lru.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
+#include <linux/memcontrol.h>
 
 #ifdef CONFIG_MEMCG_KMEM
 static LIST_HEAD(list_lrus);
@@ -38,16 +39,71 @@ static void list_lru_unregister(struct list_lru *lru)
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return !!lru->node[0].memcg_lrus;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	/*
+	 * The lock protects the array of per cgroup lists from relocation
+	 * (see memcg_update_list_lru_node).
+	 */
+	lockdep_assert_held(&nlru->lock);
+	if (nlru->memcg_lrus && idx >= 0)
+		return nlru->memcg_lrus->lru[idx];
+
+	return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	struct mem_cgroup *memcg;
+
+	if (!nlru->memcg_lrus)
+		return &nlru->lru;
+
+	memcg = mem_cgroup_from_kmem(ptr);
+	if (!memcg)
+		return &nlru->lru;
+
+	return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+}
+#else
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return false;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 bool list_lru_add(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
-	WARN_ON_ONCE(nlru->nr_items < 0);
+	l = list_lru_from_kmem(nlru, item);
+	WARN_ON_ONCE(l->nr_items < 0);
 	if (list_empty(item)) {
-		list_add_tail(item, &nlru->list);
-		nlru->nr_items++;
+		list_add_tail(item, &l->list);
+		l->nr_items++;
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -60,12 +116,14 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
+	l = list_lru_from_kmem(nlru, item);
 	if (!list_empty(item)) {
 		list_del_init(item);
-		nlru->nr_items--;
-		WARN_ON_ONCE(nlru->nr_items < 0);
+		l->nr_items--;
+		WARN_ON_ONCE(l->nr_items < 0);
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -74,33 +132,58 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_del);
 
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+					  int nid, int memcg_idx)
 {
-	unsigned long count = 0;
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
+	unsigned long count;
 
 	spin_lock(&nlru->lock);
-	WARN_ON_ONCE(nlru->nr_items < 0);
-	count += nlru->nr_items;
+	l = list_lru_from_memcg_idx(nlru, memcg_idx);
+	WARN_ON_ONCE(l->nr_items < 0);
+	count = l->nr_items;
 	spin_unlock(&nlru->lock);
 
 	return count;
 }
+
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg)
+{
+	return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+	long count = 0;
+	int memcg_idx;
+
+	count += __list_lru_count_one(lru, nid, -1);
+	if (list_lru_memcg_aware(lru)) {
+		for_each_memcg_cache_index(memcg_idx)
+			count += __list_lru_count_one(lru, nid, memcg_idx);
+	}
+	return count;
+}
 EXPORT_SYMBOL_GPL(list_lru_count_node);
 
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
-		   void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+		    list_lru_walk_cb isolate, void *cb_arg,
+		    unsigned long *nr_to_walk)
 {
 
-	struct list_lru_node	*nlru = &lru->node[nid];
+	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 	struct list_head *item, *n;
 	unsigned long isolated = 0;
 
 	spin_lock(&nlru->lock);
+	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 restart:
-	list_for_each_safe(item, n, &nlru->list) {
+	list_for_each_safe(item, n, &l->list) {
 		enum lru_status ret;
 
 		/*
@@ -116,8 +199,8 @@ restart:
 		case LRU_REMOVED_RETRY:
 			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
-			nlru->nr_items--;
-			WARN_ON_ONCE(nlru->nr_items < 0);
+			l->nr_items--;
+			WARN_ON_ONCE(l->nr_items < 0);
 			isolated++;
 			/*
 			 * If the lru lock has been dropped, our list
@@ -128,7 +211,7 @@ restart:
 				goto restart;
 			break;
 		case LRU_ROTATE:
-			list_move_tail(item, &nlru->list);
+			list_move_tail(item, &l->list);
 			break;
 		case LRU_SKIP:
 			break;
@@ -147,36 +230,279 @@ restart:
 	spin_unlock(&nlru->lock);
 	return isolated;
 }
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+		  list_lru_walk_cb isolate, void *cb_arg,
+		  unsigned long *nr_to_walk)
+{
+	return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
+				   isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+				 list_lru_walk_cb isolate, void *cb_arg,
+				 unsigned long *nr_to_walk)
+{
+	long isolated = 0;
+	int memcg_idx;
+
+	isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+					nr_to_walk);
+	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+		for_each_memcg_cache_index(memcg_idx) {
+			isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+						isolate, cb_arg, nr_to_walk);
+			if (*nr_to_walk <= 0)
+				break;
+		}
+	}
+	return isolated;
+}
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+static void init_one_lru(struct list_lru_one *l)
+{
+	INIT_LIST_HEAD(&l->list);
+	l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+					  int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++)
+		kfree(memcg_lrus->lru[i]);
+}
+
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+				      int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++) {
+		struct list_lru_one *l;
+
+		l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+		if (!l)
+			goto fail;
+
+		init_one_lru(l);
+		memcg_lrus->lru[i] = l;
+	}
+	return 0;
+fail:
+	__memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
+	return -ENOMEM;
+}
+
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+	int size = memcg_nr_cache_ids;
+
+	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+	if (!nlru->memcg_lrus)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
+		kfree(nlru->memcg_lrus);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
+	kfree(nlru->memcg_lrus);
+}
+
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+				      int old_size, int new_size)
+{
+	struct list_lru_memcg *old, *new;
+
+	BUG_ON(old_size > new_size);
+
+	old = nlru->memcg_lrus;
+	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
+		kfree(new);
+		return -ENOMEM;
+	}
+
+	memcpy(new, old, old_size * sizeof(void *));
+
+	/*
+	 * The lock guarantees that we won't race with a reader
+	 * (see list_lru_from_memcg_idx).
+	 *
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+	nlru->memcg_lrus = new;
+	spin_unlock_irq(&nlru->lock);
+
+	kfree(old);
+	return 0;
+}
+
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+					      int old_size, int new_size)
+{
+	/* do not bother shrinking the array back to the old size, because we
+	 * cannot handle allocation failures here */
+	__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+}
+
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (!memcg_aware)
+			lru->node[i].memcg_lrus = NULL;
+		else if (memcg_init_list_lru_node(&lru->node[i]))
+			goto fail;
+	}
+	return 0;
+fail:
+	for (i = i - 1; i >= 0; i--)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+	return -ENOMEM;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+}
+
+static int memcg_update_list_lru(struct list_lru *lru,
+				 int old_size, int new_size)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return 0;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (memcg_update_list_lru_node(&lru->node[i],
+					       old_size, new_size))
+			goto fail;
+	}
+	return 0;
+fail:
+	for (i = i - 1; i >= 0; i--)
+		memcg_cancel_update_list_lru_node(&lru->node[i],
+						  old_size, new_size);
+	return -ENOMEM;
+}
+
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+					 int old_size, int new_size)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_cancel_update_list_lru_node(&lru->node[i],
+						  old_size, new_size);
+}
+
+int memcg_update_all_list_lrus(int new_size)
+{
+	int ret = 0;
+	struct list_lru *lru;
+	int old_size = memcg_nr_cache_ids;
+
+	mutex_lock(&list_lrus_mutex);
+	list_for_each_entry(lru, &list_lrus, list) {
+		ret = memcg_update_list_lru(lru, old_size, new_size);
+		if (ret)
+			goto fail;
+	}
+out:
+	mutex_unlock(&list_lrus_mutex);
+	return ret;
+fail:
+	list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+		memcg_cancel_update_list_lru(lru, old_size, new_size);
+	goto out;
+}
+#else
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	return 0;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+		    struct lock_class_key *key)
 {
 	int i;
 	size_t size = sizeof(*lru->node) * nr_node_ids;
+	int err = -ENOMEM;
+
+	memcg_get_cache_ids();
 
 	lru->node = kzalloc(size, GFP_KERNEL);
 	if (!lru->node)
-		return -ENOMEM;
+		goto out;
 
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
 		if (key)
 			lockdep_set_class(&lru->node[i].lock, key);
-		INIT_LIST_HEAD(&lru->node[i].list);
-		lru->node[i].nr_items = 0;
+		init_one_lru(&lru->node[i].lru);
+	}
+
+	err = memcg_init_list_lru(lru, memcg_aware);
+	if (err) {
+		kfree(lru->node);
+		goto out;
 	}
+
 	list_lru_register(lru);
-	return 0;
+out:
+	memcg_put_cache_ids();
+	return err;
 }
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);
 
 void list_lru_destroy(struct list_lru *lru)
 {
 	/* Already destroyed or not yet initialized? */
 	if (!lru->node)
 		return;
+
+	memcg_get_cache_ids();
+
 	list_lru_unregister(lru);
+
+	memcg_destroy_list_lru(lru);
 	kfree(lru->node);
 	lru->node = NULL;
+
+	memcg_put_cache_ids();
 }
 EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6706e5fa5ac0..afa55bb38cbd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2571,6 +2571,8 @@ static int memcg_alloc_cache_id(void)
 		size = MEMCG_CACHES_MAX_SIZE;
 
 	err = memcg_update_all_caches(size);
+	if (!err)
+		err = memcg_update_all_list_lrus(size);
 	if (!err)
 		memcg_nr_cache_ids = size;
 
@@ -2765,6 +2767,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+	struct mem_cgroup *memcg = NULL;
+	struct kmem_cache *cachep;
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+	if (PageSlab(page)) {
+		cachep = page->slab_cache;
+		if (!is_root_cache(cachep))
+			memcg = cachep->memcg_params->memcg;
+	} else
+		/* page allocated by alloc_kmem_pages */
+		memcg = page->mem_cgroup;
+
+	return memcg;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-- 
cgit v1.2.3


From f7ce3190c4a35bf887adb7a1aa1ba899b679872d Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:20 -0800
Subject: slab: embed memcg_cache_params to kmem_cache

Currently, kmem_cache stores a pointer to struct memcg_cache_params
instead of embedding it.  The rationale is to save memory when kmem
accounting is disabled.  However, the memcg_cache_params has shrivelled
drastically since it was first introduced:

* Initially:

struct memcg_cache_params {
	bool is_root_cache;
	union {
		struct kmem_cache *memcg_caches[0];
		struct {
			struct mem_cgroup *memcg;
			struct list_head list;
			struct kmem_cache *root_cache;
			bool dead;
			atomic_t nr_pages;
			struct work_struct destroy;
		};
	};
};

* Now:

struct memcg_cache_params {
	bool is_root_cache;
	union {
		struct {
			struct rcu_head rcu_head;
			struct kmem_cache *memcg_caches[0];
		};
		struct {
			struct mem_cgroup *memcg;
			struct kmem_cache *root_cache;
		};
	};
};

So the memory saving does not seem to be a clear win anymore.

OTOH, keeping a pointer to memcg_cache_params struct instead of embedding
it results in touching one more cache line on kmem alloc/free hot paths.
Besides, it makes linking kmem caches in a list chained by a field of
struct memcg_cache_params really painful due to a level of indirection,
while I want to make them linked in the following patch.  That said, let
us embed it.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h     |  17 +++----
 include/linux/slab_def.h |   2 +-
 include/linux/slub_def.h |   2 +-
 mm/memcontrol.c          |  11 ++--
 mm/slab.h                |  48 +++++++++---------
 mm/slab_common.c         | 129 ++++++++++++++++++++++++++---------------------
 mm/slub.c                |   5 +-
 7 files changed, 111 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2e3b448cfa2d..1e03c11bbfbd 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -473,14 +473,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 #ifndef ARCH_SLAB_MINALIGN
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 #endif
+
+struct memcg_cache_array {
+	struct rcu_head rcu;
+	struct kmem_cache *entries[0];
+};
+
 /*
  * This is the main placeholder for memcg-related information in kmem caches.
- * struct kmem_cache will hold a pointer to it, so the memory cost while
- * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it
- * would otherwise be if that would be bundled in kmem_cache: we'll need an
- * extra pointer chase. But the trade off clearly lays in favor of not
- * penalizing non-users.
- *
  * Both the root cache and the child caches will have it. For the root cache,
  * this will hold a dynamically allocated array large enough to hold
  * information about the currently limited memcgs in the system. To allow the
@@ -495,10 +495,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 struct memcg_cache_params {
 	bool is_root_cache;
 	union {
-		struct {
-			struct rcu_head rcu_head;
-			struct kmem_cache *memcg_caches[0];
-		};
+		struct memcg_cache_array __rcu *memcg_caches;
 		struct {
 			struct mem_cgroup *memcg;
 			struct kmem_cache *root_cache;
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index b869d1662ba3..33d049066c3d 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -70,7 +70,7 @@ struct kmem_cache {
 	int obj_offset;
 #endif /* CONFIG_DEBUG_SLAB */
 #ifdef CONFIG_MEMCG_KMEM
-	struct memcg_cache_params *memcg_params;
+	struct memcg_cache_params memcg_params;
 #endif
 
 	struct kmem_cache_node *node[MAX_NUMNODES];
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d82abd40a3c0..9abf04ed0999 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -85,7 +85,7 @@ struct kmem_cache {
 	struct kobject kobj;	/* For sysfs */
 #endif
 #ifdef CONFIG_MEMCG_KMEM
-	struct memcg_cache_params *memcg_params;
+	struct memcg_cache_params memcg_params;
 	int max_attr_size; /* for propagation, maximum size of a stored attr */
 #ifdef CONFIG_SYSFS
 	struct kset *memcg_kset;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index afa55bb38cbd..6f3c0fcd7a2d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -332,7 +332,7 @@ struct mem_cgroup {
 	struct cg_proto tcp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params->memcg_caches array */
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
 #endif
 
@@ -531,7 +531,7 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -2667,8 +2667,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
 
-	VM_BUG_ON(!cachep->memcg_params);
-	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+	VM_BUG_ON(!is_root_cache(cachep));
 
 	if (current->memcg_kmem_skip_account)
 		return cachep;
@@ -2702,7 +2701,7 @@ out:
 void __memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params->memcg->css);
+		css_put(&cachep->memcg_params.memcg->css);
 }
 
 /*
@@ -2778,7 +2777,7 @@ struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
 	if (PageSlab(page)) {
 		cachep = page->slab_cache;
 		if (!is_root_cache(cachep))
-			memcg = cachep->memcg_params->memcg;
+			memcg = cachep->memcg_params.memcg;
 	} else
 		/* page allocated by alloc_kmem_pages */
 		memcg = page->mem_cgroup;
diff --git a/mm/slab.h b/mm/slab.h
index 90430d6f665e..53a623f85931 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
 extern void create_boot_cache(struct kmem_cache *, const char *name,
 			size_t size, unsigned long flags);
 
-struct mem_cgroup;
-
 int slab_unmergeable(struct kmem_cache *s);
 struct kmem_cache *find_mergeable(size_t size, size_t align,
 		unsigned long flags, const char *name, void (*ctor)(void *));
@@ -167,14 +165,13 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 #ifdef CONFIG_MEMCG_KMEM
 static inline bool is_root_cache(struct kmem_cache *s)
 {
-	return !s->memcg_params || s->memcg_params->is_root_cache;
+	return s->memcg_params.is_root_cache;
 }
 
 static inline bool slab_equal_or_root(struct kmem_cache *s,
-					struct kmem_cache *p)
+				      struct kmem_cache *p)
 {
-	return (p == s) ||
-		(s->memcg_params && (p == s->memcg_params->root_cache));
+	return p == s || p == s->memcg_params.root_cache;
 }
 
 /*
@@ -185,37 +182,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
 static inline const char *cache_name(struct kmem_cache *s)
 {
 	if (!is_root_cache(s))
-		return s->memcg_params->root_cache->name;
+		s = s->memcg_params.root_cache;
 	return s->name;
 }
 
 /*
  * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away. Since once
- * created a memcg's cache is destroyed only along with the root cache, it is
- * true if we are going to allocate from the cache or hold a reference to the
- * root cache by other means. Otherwise, we should hold either the slab_mutex
- * or the memcg's slab_caches_mutex while calling this function and accessing
- * the returned value.
+ * That said the caller must assure the memcg's cache won't go away by either
+ * taking a css reference to the owner cgroup, or holding the slab_mutex.
  */
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
 	struct kmem_cache *cachep;
-	struct memcg_cache_params *params;
-
-	if (!s->memcg_params)
-		return NULL;
+	struct memcg_cache_array *arr;
 
 	rcu_read_lock();
-	params = rcu_dereference(s->memcg_params);
+	arr = rcu_dereference(s->memcg_params.memcg_caches);
 
 	/*
 	 * Make sure we will access the up-to-date value. The code updating
 	 * memcg_caches issues a write barrier to match this (see
-	 * memcg_register_cache()).
+	 * memcg_create_kmem_cache()).
 	 */
-	cachep = lockless_dereference(params->memcg_caches[idx]);
+	cachep = lockless_dereference(arr->entries[idx]);
 	rcu_read_unlock();
 
 	return cachep;
@@ -225,7 +215,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		return s;
-	return s->memcg_params->root_cache;
+	return s->memcg_params.root_cache;
 }
 
 static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +225,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
 		return 0;
 	if (is_root_cache(s))
 		return 0;
-	return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
+	return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
 }
 
 static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +234,13 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
 		return;
 	if (is_root_cache(s))
 		return;
-	memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order);
+	memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
 }
-#else
+
+extern void slab_init_memcg_params(struct kmem_cache *);
+
+#else /* !CONFIG_MEMCG_KMEM */
+
 static inline bool is_root_cache(struct kmem_cache *s)
 {
 	return true;
@@ -282,7 +276,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
 static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
 {
 }
-#endif
+
+static inline void slab_init_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23f5fcde6043..7cc32cf126ef 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,66 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
-static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
-		struct kmem_cache *s, struct kmem_cache *root_cache)
+void slab_init_memcg_params(struct kmem_cache *s)
 {
-	size_t size;
+	s->memcg_params.is_root_cache = true;
+	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+}
+
+static int init_memcg_params(struct kmem_cache *s,
+		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	struct memcg_cache_array *arr;
 
-	if (!memcg_kmem_enabled())
+	if (memcg) {
+		s->memcg_params.is_root_cache = false;
+		s->memcg_params.memcg = memcg;
+		s->memcg_params.root_cache = root_cache;
 		return 0;
+	}
 
-	if (!memcg) {
-		size = offsetof(struct memcg_cache_params, memcg_caches);
-		size += memcg_nr_cache_ids * sizeof(void *);
-	} else
-		size = sizeof(struct memcg_cache_params);
+	slab_init_memcg_params(s);
 
-	s->memcg_params = kzalloc(size, GFP_KERNEL);
-	if (!s->memcg_params)
-		return -ENOMEM;
+	if (!memcg_nr_cache_ids)
+		return 0;
 
-	if (memcg) {
-		s->memcg_params->memcg = memcg;
-		s->memcg_params->root_cache = root_cache;
-	} else
-		s->memcg_params->is_root_cache = true;
+	arr = kzalloc(sizeof(struct memcg_cache_array) +
+		      memcg_nr_cache_ids * sizeof(void *),
+		      GFP_KERNEL);
+	if (!arr)
+		return -ENOMEM;
 
+	RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
 	return 0;
 }
 
-static void memcg_free_cache_params(struct kmem_cache *s)
+static void destroy_memcg_params(struct kmem_cache *s)
 {
-	kfree(s->memcg_params);
+	if (is_root_cache(s))
+		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
 }
 
-static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
+static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 {
-	int size;
-	struct memcg_cache_params *new_params, *cur_params;
+	struct memcg_cache_array *old, *new;
 
-	BUG_ON(!is_root_cache(s));
-
-	size = offsetof(struct memcg_cache_params, memcg_caches);
-	size += num_memcgs * sizeof(void *);
+	if (!is_root_cache(s))
+		return 0;
 
-	new_params = kzalloc(size, GFP_KERNEL);
-	if (!new_params)
+	new = kzalloc(sizeof(struct memcg_cache_array) +
+		      new_array_size * sizeof(void *), GFP_KERNEL);
+	if (!new)
 		return -ENOMEM;
 
-	cur_params = s->memcg_params;
-	memcpy(new_params->memcg_caches, cur_params->memcg_caches,
-	       memcg_nr_cache_ids * sizeof(void *));
-
-	new_params->is_root_cache = true;
-
-	rcu_assign_pointer(s->memcg_params, new_params);
-	if (cur_params)
-		kfree_rcu(cur_params, rcu_head);
+	old = rcu_dereference_protected(s->memcg_params.memcg_caches,
+					lockdep_is_held(&slab_mutex));
+	if (old)
+		memcpy(new->entries, old->entries,
+		       memcg_nr_cache_ids * sizeof(void *));
 
+	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
+	if (old)
+		kfree_rcu(old, rcu);
 	return 0;
 }
 
@@ -172,10 +176,7 @@ int memcg_update_all_caches(int num_memcgs)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
-		if (!is_root_cache(s))
-			continue;
-
-		ret = memcg_update_cache_params(s, num_memcgs);
+		ret = update_memcg_params(s, num_memcgs);
 		/*
 		 * Instead of freeing the memory, we'll just leave the caches
 		 * up to this point in an updated state.
@@ -187,13 +188,13 @@ int memcg_update_all_caches(int num_memcgs)
 	return ret;
 }
 #else
-static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
-		struct kmem_cache *s, struct kmem_cache *root_cache)
+static inline int init_memcg_params(struct kmem_cache *s,
+		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
 {
 	return 0;
 }
 
-static inline void memcg_free_cache_params(struct kmem_cache *s)
+static inline void destroy_memcg_params(struct kmem_cache *s)
 {
 }
 #endif /* CONFIG_MEMCG_KMEM */
@@ -311,7 +312,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
 	s->align = align;
 	s->ctor = ctor;
 
-	err = memcg_alloc_cache_params(memcg, s, root_cache);
+	err = init_memcg_params(s, memcg, root_cache);
 	if (err)
 		goto out_free_cache;
 
@@ -327,7 +328,7 @@ out:
 	return s;
 
 out_free_cache:
-	memcg_free_cache_params(s);
+	destroy_memcg_params(s);
 	kmem_cache_free(kmem_cache, s);
 	goto out;
 }
@@ -439,11 +440,15 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
 
 #ifdef CONFIG_MEMCG_KMEM
 	if (!is_root_cache(s)) {
-		struct kmem_cache *root_cache = s->memcg_params->root_cache;
-		int memcg_id = memcg_cache_id(s->memcg_params->memcg);
-
-		BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
-		root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
+		int idx;
+		struct memcg_cache_array *arr;
+
+		idx = memcg_cache_id(s->memcg_params.memcg);
+		arr = rcu_dereference_protected(s->memcg_params.root_cache->
+						memcg_params.memcg_caches,
+						lockdep_is_held(&slab_mutex));
+		BUG_ON(arr->entries[idx] != s);
+		arr->entries[idx] = NULL;
 	}
 #endif
 	list_move(&s->list, release);
@@ -481,27 +486,32 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 			     struct kmem_cache *root_cache)
 {
 	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
-	int memcg_id = memcg_cache_id(memcg);
+	struct memcg_cache_array *arr;
 	struct kmem_cache *s = NULL;
 	char *cache_name;
+	int idx;
 
 	get_online_cpus();
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 
+	idx = memcg_cache_id(memcg);
+	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
+					lockdep_is_held(&slab_mutex));
+
 	/*
 	 * Since per-memcg caches are created asynchronously on first
 	 * allocation (see memcg_kmem_get_cache()), several threads can try to
 	 * create the same cache, but only one of them may succeed.
 	 */
-	if (cache_from_memcg_idx(root_cache, memcg_id))
+	if (arr->entries[idx])
 		goto out_unlock;
 
 	cgroup_name(mem_cgroup_css(memcg)->cgroup,
 		    memcg_name_buf, sizeof(memcg_name_buf));
 	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
-			       memcg_cache_id(memcg), memcg_name_buf);
+			       idx, memcg_name_buf);
 	if (!cache_name)
 		goto out_unlock;
 
@@ -525,7 +535,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	 * initialized.
 	 */
 	smp_wmb();
-	root_cache->memcg_params->memcg_caches[memcg_id] = s;
+	arr->entries[idx] = s;
 
 out_unlock:
 	mutex_unlock(&slab_mutex);
@@ -545,7 +555,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry_safe(s, s2, &slab_caches, list) {
-		if (is_root_cache(s) || s->memcg_params->memcg != memcg)
+		if (is_root_cache(s) || s->memcg_params.memcg != memcg)
 			continue;
 		/*
 		 * The cgroup is about to be freed and therefore has no charges
@@ -564,7 +574,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
 
 void slab_kmem_cache_release(struct kmem_cache *s)
 {
-	memcg_free_cache_params(s);
+	destroy_memcg_params(s);
 	kfree(s->name);
 	kmem_cache_free(kmem_cache, s);
 }
@@ -640,6 +650,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
 	s->name = name;
 	s->size = s->object_size = size;
 	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+	slab_init_memcg_params(s);
+
 	err = __kmem_cache_create(s, flags);
 
 	if (err)
@@ -980,7 +993,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
 
 	if (p == slab_caches.next)
 		print_slabinfo_header(m);
-	if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
+	if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
 		cache_show(s, m);
 	return 0;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 8b8508adf9c2..75d55fdfe3a1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3577,6 +3577,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 			p->slab_cache = s;
 #endif
 	}
+	slab_init_memcg_params(s);
 	list_add(&s->list, &slab_caches);
 	return s;
 }
@@ -4964,7 +4965,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 	if (is_root_cache(s))
 		return;
 
-	root_cache = s->memcg_params->root_cache;
+	root_cache = s->memcg_params.root_cache;
 
 	/*
 	 * This mean this cache had no attribute written. Therefore, no point
@@ -5044,7 +5045,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
 {
 #ifdef CONFIG_MEMCG_KMEM
 	if (!is_root_cache(s))
-		return s->memcg_params->root_cache->memcg_kset;
+		return s->memcg_params.root_cache->memcg_kset;
 #endif
 	return slab_kset;
 }
-- 
cgit v1.2.3


From 426589f571f7d6d5ab2ca33ece73164149279ca1 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:23 -0800
Subject: slab: link memcg caches of the same kind into a list

Sometimes, we need to iterate over all memcg copies of a particular root
kmem cache.  Currently, we use memcg_cache_params->memcg_caches array for
that, because it contains all existing memcg caches.

However, it's a bad practice to keep all caches, including those that
belong to offline cgroups, in this array, because it will be growing
beyond any bounds then.  I'm going to wipe away dead caches from it to
save space.  To still be able to perform iterations over all memcg caches
of the same kind, let us link them into a list.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  4 ++++
 mm/slab.c            | 13 +++++--------
 mm/slab.h            | 17 +++++++++++++++++
 mm/slab_common.c     | 21 ++++++++++-----------
 mm/slub.c            | 19 +++++--------------
 5 files changed, 41 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1e03c11bbfbd..26d99f41b410 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -491,9 +491,13 @@ struct memcg_cache_array {
  *
  * @memcg: pointer to the memcg this cache belongs to
  * @root_cache: pointer to the global, root cache, this cache was derived from
+ *
+ * Both root and child caches of the same kind are linked into a list chained
+ * through @list.
  */
 struct memcg_cache_params {
 	bool is_root_cache;
+	struct list_head list;
 	union {
 		struct memcg_cache_array __rcu *memcg_caches;
 		struct {
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb6f671..7894017bc160 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared, gfp_t gfp)
 {
 	int ret;
-	struct kmem_cache *c = NULL;
-	int i = 0;
+	struct kmem_cache *c;
 
 	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
 
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	if ((ret < 0) || !is_root_cache(cachep))
 		return ret;
 
-	VM_BUG_ON(!mutex_is_locked(&slab_mutex));
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg_idx(cachep, i);
-		if (c)
-			/* return value determined by the parent cache only */
-			__do_tune_cpucache(c, limit, batchcount, shared, gfp);
+	lockdep_assert_held(&slab_mutex);
+	for_each_memcg_cache(c, cachep) {
+		/* return value determined by the root cache only */
+		__do_tune_cpucache(c, limit, batchcount, shared, gfp);
 	}
 
 	return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 53a623f85931..0a56d76ac0e9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,18 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 		       size_t count, loff_t *ppos);
 
 #ifdef CONFIG_MEMCG_KMEM
+/*
+ * Iterate over all memcg caches of the given root cache. The caller must hold
+ * slab_mutex.
+ */
+#define for_each_memcg_cache(iter, root) \
+	list_for_each_entry(iter, &(root)->memcg_params.list, \
+			    memcg_params.list)
+
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+	list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
+				 memcg_params.list)
+
 static inline bool is_root_cache(struct kmem_cache *s)
 {
 	return s->memcg_params.is_root_cache;
@@ -241,6 +253,11 @@ extern void slab_init_memcg_params(struct kmem_cache *);
 
 #else /* !CONFIG_MEMCG_KMEM */
 
+#define for_each_memcg_cache(iter, root) \
+	for ((void)(iter), (void)(root); 0; )
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+	for ((void)(iter), (void)(tmp), (void)(root); 0; )
+
 static inline bool is_root_cache(struct kmem_cache *s)
 {
 	return true;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 7cc32cf126ef..989784bd88be 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -109,6 +109,7 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
 void slab_init_memcg_params(struct kmem_cache *s)
 {
 	s->memcg_params.is_root_cache = true;
+	INIT_LIST_HEAD(&s->memcg_params.list);
 	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
 }
 
@@ -449,6 +450,7 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
 						lockdep_is_held(&slab_mutex));
 		BUG_ON(arr->entries[idx] != s);
 		arr->entries[idx] = NULL;
+		list_del(&s->memcg_params.list);
 	}
 #endif
 	list_move(&s->list, release);
@@ -529,6 +531,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 		goto out_unlock;
 	}
 
+	list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+
 	/*
 	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
 	 * barrier here to ensure nobody will see the kmem_cache partially
@@ -581,11 +585,13 @@ void slab_kmem_cache_release(struct kmem_cache *s)
 
 void kmem_cache_destroy(struct kmem_cache *s)
 {
-	int i;
+	struct kmem_cache *c, *c2;
 	LIST_HEAD(release);
 	bool need_rcu_barrier = false;
 	bool busy = false;
 
+	BUG_ON(!is_root_cache(s));
+
 	get_online_cpus();
 	get_online_mems();
 
@@ -595,10 +601,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	if (s->refcount)
 		goto out_unlock;
 
-	for_each_memcg_cache_index(i) {
-		struct kmem_cache *c = cache_from_memcg_idx(s, i);
-
-		if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+	for_each_memcg_cache_safe(c, c2, s) {
+		if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
 			busy = true;
 	}
 
@@ -932,16 +936,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
 {
 	struct kmem_cache *c;
 	struct slabinfo sinfo;
-	int i;
 
 	if (!is_root_cache(s))
 		return;
 
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg_idx(s, i);
-		if (!c)
-			continue;
-
+	for_each_memcg_cache(c, s) {
 		memset(&sinfo, 0, sizeof(sinfo));
 		get_slabinfo(c, &sinfo);
 
diff --git a/mm/slub.c b/mm/slub.c
index 75d55fdfe3a1..1e5a4636cb23 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3636,13 +3636,10 @@ struct kmem_cache *
 __kmem_cache_alias(const char *name, size_t size, size_t align,
 		   unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *s;
+	struct kmem_cache *s, *c;
 
 	s = find_mergeable(size, align, flags, name, ctor);
 	if (s) {
-		int i;
-		struct kmem_cache *c;
-
 		s->refcount++;
 
 		/*
@@ -3652,10 +3649,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
 		s->object_size = max(s->object_size, (int)size);
 		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
 
-		for_each_memcg_cache_index(i) {
-			c = cache_from_memcg_idx(s, i);
-			if (!c)
-				continue;
+		for_each_memcg_cache(c, s) {
 			c->object_size = s->object_size;
 			c->inuse = max_t(int, c->inuse,
 					 ALIGN(size, sizeof(void *)));
@@ -4921,7 +4915,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 	err = attribute->store(s, buf, len);
 #ifdef CONFIG_MEMCG_KMEM
 	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
-		int i;
+		struct kmem_cache *c;
 
 		mutex_lock(&slab_mutex);
 		if (s->max_attr_size < len)
@@ -4944,11 +4938,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 		 * directly either failed or succeeded, in which case we loop
 		 * through the descendants with best-effort propagation.
 		 */
-		for_each_memcg_cache_index(i) {
-			struct kmem_cache *c = cache_from_memcg_idx(s, i);
-			if (c)
-				attribute->store(c, buf, len);
-		}
+		for_each_memcg_cache(c, s)
+			attribute->store(c, buf, len);
 		mutex_unlock(&slab_mutex);
 	}
 #endif
-- 
cgit v1.2.3


From 2a4db7eb9391a544ff58f4fa11d35246e87c87af Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:32 -0800
Subject: memcg: free memcg_caches slot on css offline

We need to look up a kmem_cache in ->memcg_params.memcg_caches arrays only
on allocations, so there is no need to have the array entries set until
css free - we can clear them on css offline.  This will allow us to reuse
array entries more efficiently and avoid costly array relocations.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 10 +++++-----
 mm/memcontrol.c      | 38 ++++++++++++++++++++++++++++++++------
 mm/slab_common.c     | 39 ++++++++++++++++++++++++++++-----------
 3 files changed, 65 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 26d99f41b410..ed2ffaab59ea 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -115,13 +115,12 @@ int slab_is_available(void);
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
-#ifdef CONFIG_MEMCG_KMEM
-void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
-#endif
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-void kmem_cache_free(struct kmem_cache *, void *);
+
+void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
+void memcg_deactivate_kmem_caches(struct mem_cgroup *);
+void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -288,6 +287,7 @@ static __always_inline int kmalloc_index(size_t size)
 
 void *__kmalloc(size_t size, gfp_t flags);
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void kmem_cache_free(struct kmem_cache *, void *);
 
 #ifdef CONFIG_NUMA
 void *__kmalloc_node(size_t size, gfp_t flags, int node);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f3c0fcd7a2d..abfe0135bfdc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -334,6 +334,7 @@ struct mem_cgroup {
 #if defined(CONFIG_MEMCG_KMEM)
         /* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
+	bool kmem_acct_active;
 #endif
 
 	int last_scanned_node;
@@ -354,7 +355,7 @@ struct mem_cgroup {
 #ifdef CONFIG_MEMCG_KMEM
 bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-	return memcg->kmemcg_id >= 0;
+	return memcg->kmem_acct_active;
 }
 #endif
 
@@ -585,7 +586,7 @@ static void memcg_free_cache_id(int id);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg)) {
+	if (memcg->kmemcg_id >= 0) {
 		static_key_slow_dec(&memcg_kmem_enabled_key);
 		memcg_free_cache_id(memcg->kmemcg_id);
 	}
@@ -2666,6 +2667,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
 
@@ -2673,10 +2675,11 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 		return cachep;
 
 	memcg = get_mem_cgroup_from_mm(current->mm);
-	if (!memcg_kmem_is_active(memcg))
+	kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+	if (kmemcg_id < 0)
 		goto out;
 
-	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
 	if (likely(memcg_cachep))
 		return memcg_cachep;
 
@@ -3318,8 +3321,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 	int err = 0;
 	int memcg_id;
 
-	if (memcg_kmem_is_active(memcg))
-		return 0;
+	BUG_ON(memcg->kmemcg_id >= 0);
+	BUG_ON(memcg->kmem_acct_active);
 
 	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
@@ -3362,6 +3365,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 	 * patched.
 	 */
 	memcg->kmemcg_id = memcg_id;
+	memcg->kmem_acct_active = true;
 out:
 	return err;
 }
@@ -4041,6 +4045,22 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+	if (!memcg->kmem_acct_active)
+		return;
+
+	/*
+	 * Clear the 'active' flag before clearing memcg_caches arrays entries.
+	 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+	 * guarantees no cache will be created for this cgroup after we are
+	 * done (see memcg_create_kmem_cache()).
+	 */
+	memcg->kmem_acct_active = false;
+
+	memcg_deactivate_kmem_caches(memcg);
+}
+
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 	memcg_destroy_kmem_caches(memcg);
@@ -4052,6 +4072,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return 0;
 }
 
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 }
@@ -4608,6 +4632,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	spin_unlock(&memcg->event_list_lock);
 
 	vmpressure_cleanup(&memcg->vmpressure);
+
+	memcg_deactivate_kmem(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6087b1f9a385..0873bcc61c7a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -440,18 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
 		*need_rcu_barrier = true;
 
 #ifdef CONFIG_MEMCG_KMEM
-	if (!is_root_cache(s)) {
-		int idx;
-		struct memcg_cache_array *arr;
-
-		idx = memcg_cache_id(s->memcg_params.memcg);
-		arr = rcu_dereference_protected(s->memcg_params.root_cache->
-						memcg_params.memcg_caches,
-						lockdep_is_held(&slab_mutex));
-		BUG_ON(arr->entries[idx] != s);
-		arr->entries[idx] = NULL;
+	if (!is_root_cache(s))
 		list_del(&s->memcg_params.list);
-	}
 #endif
 	list_move(&s->list, release);
 	return 0;
@@ -499,6 +489,13 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	mutex_lock(&slab_mutex);
 
+	/*
+	 * The memory cgroup could have been deactivated while the cache
+	 * creation work was pending.
+	 */
+	if (!memcg_kmem_is_active(memcg))
+		goto out_unlock;
+
 	idx = memcg_cache_id(memcg);
 	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
 					lockdep_is_held(&slab_mutex));
@@ -548,6 +545,26 @@ out_unlock:
 	put_online_cpus();
 }
 
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+{
+	int idx;
+	struct memcg_cache_array *arr;
+	struct kmem_cache *s;
+
+	idx = memcg_cache_id(memcg);
+
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!is_root_cache(s))
+			continue;
+
+		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+						lockdep_is_held(&slab_mutex));
+		arr->entries[idx] = NULL;
+	}
+	mutex_unlock(&slab_mutex);
+}
+
 void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
 {
 	LIST_HEAD(release);
-- 
cgit v1.2.3


From 3f97b163207c67a3b35931494ad3db1de66356f0 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:35 -0800
Subject: list_lru: add helpers to isolate items

Currently, the isolate callback passed to the list_lru_walk family of
functions is supposed to just delete an item from the list upon returning
LRU_REMOVED or LRU_REMOVED_RETRY, while nr_items counter is fixed by
__list_lru_walk_one after the callback returns.  Since the callback is
allowed to drop the lock after removing an item (it has to return
LRU_REMOVED_RETRY then), the nr_items can be less than the actual number
of elements on the list even if we check them under the lock.  This makes
it difficult to move items from one list_lru_one to another, which is
required for per-memcg list_lru reparenting - we can't just splice the
lists, we have to move entries one by one.

This patch therefore introduces helpers that must be used by callback
functions to isolate items instead of raw list_del/list_move.  These are
list_lru_isolate and list_lru_isolate_move.  They not only remove the
entry from the list, but also fix the nr_items counter, making sure
nr_items always reflects the actual number of elements on the list if
checked under the appropriate lock.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c              | 21 +++++++++++----------
 fs/gfs2/quota.c          |  5 +++--
 fs/inode.c               |  8 ++++----
 fs/xfs/xfs_buf.c         |  6 ++++--
 fs/xfs/xfs_qm.c          |  5 +++--
 include/linux/list_lru.h |  9 +++++++--
 mm/list_lru.c            | 19 ++++++++++++++++---
 mm/workingset.c          |  3 ++-
 8 files changed, 50 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 56c5da89f58a..d04be762b216 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -400,19 +400,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
  * LRU lists entirely, while shrink_move moves it to the indicated
  * private list.
  */
-static void d_lru_isolate(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
-	list_del_init(&dentry->d_lru);
+	list_lru_isolate(lru, &dentry->d_lru);
 }
 
-static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+			      struct list_head *list)
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags |= DCACHE_SHRINK_LIST;
-	list_move_tail(&dentry->d_lru, list);
+	list_lru_isolate_move(lru, &dentry->d_lru, list);
 }
 
 /*
@@ -869,8 +870,8 @@ static void shrink_dentry_list(struct list_head *list)
 	}
 }
 
-static enum lru_status
-dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +891,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 	 * another pass through the LRU.
 	 */
 	if (dentry->d_lockref.count) {
-		d_lru_isolate(dentry);
+		d_lru_isolate(lru, dentry);
 		spin_unlock(&dentry->d_lock);
 		return LRU_REMOVED;
 	}
@@ -921,7 +922,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 		return LRU_ROTATE;
 	}
 
-	d_lru_shrink_move(dentry, freeable);
+	d_lru_shrink_move(lru, dentry, freeable);
 	spin_unlock(&dentry->d_lock);
 
 	return LRU_REMOVED;
@@ -951,7 +952,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 }
 
 static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
-						spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -964,7 +965,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
 	if (!spin_trylock(&dentry->d_lock))
 		return LRU_SKIP;
 
-	d_lru_shrink_move(dentry, freeable);
+	d_lru_shrink_move(lru, dentry, freeable);
 	spin_unlock(&dentry->d_lock);
 
 	return LRU_REMOVED;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c15d6b216d0b..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
 }
 
 
-static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+static enum lru_status gfs2_qd_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *dispose = arg;
 	struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
 
 	if (qd->qd_lockref.count == 0) {
 		lockref_mark_dead(&qd->qd_lockref);
-		list_move(&qd->qd_lru, dispose);
+		list_lru_isolate_move(lru, &qd->qd_lru, dispose);
 	}
 
 	spin_unlock(&qd->qd_lockref.lock);
diff --git a/fs/inode.c b/fs/inode.c
index 524a32c2b0c6..86c612b92c6f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -685,8 +685,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-static enum lru_status
-inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status inode_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct inode	*inode = container_of(item, struct inode, i_lru);
@@ -704,7 +704,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 	 */
 	if (atomic_read(&inode->i_count) ||
 	    (inode->i_state & ~I_REFERENCED)) {
-		list_del_init(&inode->i_lru);
+		list_lru_isolate(lru, &inode->i_lru);
 		spin_unlock(&inode->i_lock);
 		this_cpu_dec(nr_unused);
 		return LRU_REMOVED;
@@ -738,7 +738,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
-	list_move(&inode->i_lru, freeable);
+	list_lru_isolate_move(lru, &inode->i_lru, freeable);
 	spin_unlock(&inode->i_lock);
 
 	this_cpu_dec(nr_unused);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15c9d224c721..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
 static enum lru_status
 xfs_buftarg_wait_rele(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)
 
@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
 	 */
 	atomic_set(&bp->b_lru_ref, 0);
 	bp->b_state |= XFS_BSTATE_DISPOSE;
-	list_move(item, dispose);
+	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
 static enum lru_status
 xfs_buftarg_isolate(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)
 {
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
 	}
 
 	bp->b_state |= XFS_BSTATE_DISPOSE;
-	list_move(item, dispose);
+	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 4f4b1274e144..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
 static enum lru_status
 xfs_qm_dquot_isolate(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)
 		__releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
 		XFS_STATS_INC(xs_qm_dqwants);
 
 		trace_xfs_dqreclaim_want(dqp);
-		list_del_init(&dqp->q_lru);
+		list_lru_isolate(lru, &dqp->q_lru);
 		XFS_STATS_DEC(xs_qm_dquot_unused);
 		return LRU_REMOVED;
 	}
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
 	xfs_dqunlock(dqp);
 
 	ASSERT(dqp->q_nrefs == 0);
-	list_move_tail(&dqp->q_lru, &isol->dispose);
+	list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
 	XFS_STATS_DEC(xs_qm_dquot_unused);
 	trace_xfs_dqreclaim_done(dqp);
 	XFS_STATS_INC(xs_qm_dqreclaims);
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 305b598abac2..7edf9c9ab9eb 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -125,8 +125,13 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
 	return count;
 }
 
-typedef enum lru_status
-(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head);
+
+typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
+		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
+
 /**
  * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 79aee70c3b9d..8d9d168c6c38 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -132,6 +132,21 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_del);
 
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+	list_del_init(item);
+	list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head)
+{
+	list_move(item, head);
+	list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
 static unsigned long __list_lru_count_one(struct list_lru *lru,
 					  int nid, int memcg_idx)
 {
@@ -194,13 +209,11 @@ restart:
 			break;
 		--*nr_to_walk;
 
-		ret = isolate(item, &nlru->lock, cb_arg);
+		ret = isolate(item, l, &nlru->lock, cb_arg);
 		switch (ret) {
 		case LRU_REMOVED_RETRY:
 			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
-			l->nr_items--;
-			WARN_ON_ONCE(l->nr_items < 0);
 			isolated++;
 			/*
 			 * If the lru lock has been dropped, our list
diff --git a/mm/workingset.c b/mm/workingset.c
index d4fa7fb10a52..aa017133744b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 }
 
 static enum lru_status shadow_lru_isolate(struct list_head *item,
+					  struct list_lru_one *lru,
 					  spinlock_t *lru_lock,
 					  void *arg)
 {
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 		goto out;
 	}
 
-	list_del_init(item);
+	list_lru_isolate(lru, item);
 	spin_unlock(lru_lock);
 
 	/*
-- 
cgit v1.2.3


From 2788cf0c401c268b4819c5407493a8769b7007aa Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Feb 2015 14:59:38 -0800
Subject: memcg: reparent list_lrus and free kmemcg_id on css offline

Now, the only reason to keep kmemcg_id till css free is list_lru, which
uses it to distribute elements between per-memcg lists.  However, it can
be easily sorted out - we only need to change kmemcg_id of an offline
cgroup to its parent's id, making further list_lru_add()'s add elements to
the parent's list, and then move all elements from the offline cgroup's
list to the one of its parent.  It will work, because a racing
list_lru_del() does not need to know the list it is deleting the element
from.  It can decrement the wrong nr_items counter though, but the ongoing
reparenting will fix it.  After list_lru reparenting is done we are free
to release kmemcg_id saving a valuable slot in a per-memcg array for new
cgroups.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list_lru.h |  3 ++-
 mm/list_lru.c            | 46 +++++++++++++++++++++++++++++++++++++++++++---
 mm/memcontrol.c          | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 79 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 7edf9c9ab9eb..2a6b9947aaa3 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -26,7 +26,7 @@ enum lru_status {
 
 struct list_lru_one {
 	struct list_head	list;
-	/* kept as signed so we can catch imbalance bugs */
+	/* may become negative during memcg reparenting */
 	long			nr_items;
 };
 
@@ -62,6 +62,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
 #define list_lru_init_memcg(lru)	__list_lru_init((lru), true, NULL)
 
 int memcg_update_all_list_lrus(int num_memcgs);
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx);
 
 /**
  * list_lru_add: add an element to the lru list's tail
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 8d9d168c6c38..909eca2c820e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -100,7 +100,6 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
 
 	spin_lock(&nlru->lock);
 	l = list_lru_from_kmem(nlru, item);
-	WARN_ON_ONCE(l->nr_items < 0);
 	if (list_empty(item)) {
 		list_add_tail(item, &l->list);
 		l->nr_items++;
@@ -123,7 +122,6 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 	if (!list_empty(item)) {
 		list_del_init(item);
 		l->nr_items--;
-		WARN_ON_ONCE(l->nr_items < 0);
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -156,7 +154,6 @@ static unsigned long __list_lru_count_one(struct list_lru *lru,
 
 	spin_lock(&nlru->lock);
 	l = list_lru_from_memcg_idx(nlru, memcg_idx);
-	WARN_ON_ONCE(l->nr_items < 0);
 	count = l->nr_items;
 	spin_unlock(&nlru->lock);
 
@@ -458,6 +455,49 @@ fail:
 		memcg_cancel_update_list_lru(lru, old_size, new_size);
 	goto out;
 }
+
+static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
+				      int src_idx, int dst_idx)
+{
+	struct list_lru_one *src, *dst;
+
+	/*
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+
+	src = list_lru_from_memcg_idx(nlru, src_idx);
+	dst = list_lru_from_memcg_idx(nlru, dst_idx);
+
+	list_splice_init(&src->list, &dst->list);
+	dst->nr_items += src->nr_items;
+	src->nr_items = 0;
+
+	spin_unlock_irq(&nlru->lock);
+}
+
+static void memcg_drain_list_lru(struct list_lru *lru,
+				 int src_idx, int dst_idx)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+}
+
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+{
+	struct list_lru *lru;
+
+	mutex_lock(&list_lrus_mutex);
+	list_for_each_entry(lru, &list_lrus, list)
+		memcg_drain_list_lru(lru, src_idx, dst_idx);
+	mutex_unlock(&list_lrus_mutex);
+}
 #else
 static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index abfe0135bfdc..419c06b1794a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -334,6 +334,7 @@ struct mem_cgroup {
 #if defined(CONFIG_MEMCG_KMEM)
         /* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
+	bool kmem_acct_activated;
 	bool kmem_acct_active;
 #endif
 
@@ -582,14 +583,10 @@ void memcg_put_cache_ids(void)
 struct static_key memcg_kmem_enabled_key;
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
-static void memcg_free_cache_id(int id);
-
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg->kmemcg_id >= 0) {
+	if (memcg->kmem_acct_activated)
 		static_key_slow_dec(&memcg_kmem_enabled_key);
-		memcg_free_cache_id(memcg->kmemcg_id);
-	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
@@ -3322,6 +3319,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 	int memcg_id;
 
 	BUG_ON(memcg->kmemcg_id >= 0);
+	BUG_ON(memcg->kmem_acct_activated);
 	BUG_ON(memcg->kmem_acct_active);
 
 	/*
@@ -3365,6 +3363,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 	 * patched.
 	 */
 	memcg->kmemcg_id = memcg_id;
+	memcg->kmem_acct_activated = true;
 	memcg->kmem_acct_active = true;
 out:
 	return err;
@@ -4047,6 +4046,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
 {
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *parent, *child;
+	int kmemcg_id;
+
 	if (!memcg->kmem_acct_active)
 		return;
 
@@ -4059,6 +4062,32 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
 	memcg->kmem_acct_active = false;
 
 	memcg_deactivate_kmem_caches(memcg);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
+	parent = parent_mem_cgroup(memcg);
+	if (!parent)
+		parent = root_mem_cgroup;
+
+	/*
+	 * Change kmemcg_id of this cgroup and all its descendants to the
+	 * parent's id, and then move all entries from this cgroup's list_lrus
+	 * to ones of the parent. After we have finished, all list_lrus
+	 * corresponding to this cgroup are guaranteed to remain empty. The
+	 * ordering is imposed by list_lru_node->lock taken by
+	 * memcg_drain_all_list_lrus().
+	 */
+	css_for_each_descendant_pre(css, &memcg->css) {
+		child = mem_cgroup_from_css(css);
+		BUG_ON(child->kmemcg_id != kmemcg_id);
+		child->kmemcg_id = parent->kmemcg_id;
+		if (!memcg->use_hierarchy)
+			break;
+	}
+	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+	memcg_free_cache_id(kmemcg_id);
 }
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-- 
cgit v1.2.3


From 2d2f5119b8bb057595e18f5b2f07aa097ea1b233 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 12 Feb 2015 14:59:59 -0800
Subject: mm: do not use mm->nr_pmds on !MMU configurations

mm->nr_pmds doesn't make sense on !MMU configurations

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 9 ++++++++-
 kernel/fork.c      | 4 +---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index af4ff88a11e0..bd52e2f14027 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1447,13 +1447,15 @@ static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
 #endif
 
-#ifdef __PAGETABLE_PMD_FOLDED
+#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 						unsigned long address)
 {
 	return 0;
 }
 
+static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
+
 static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
 {
 	return 0;
@@ -1465,6 +1467,11 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
 
+static inline void mm_nr_pmds_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_pmds, 0);
+}
+
 static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
 {
 	return atomic_long_read(&mm->nr_pmds);
diff --git a/kernel/fork.c b/kernel/fork.c
index 66e19c251581..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -555,9 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
-#ifndef __PAGETABLE_PMD_FOLDED
-	atomic_long_set(&mm->nr_pmds, 0);
-#endif
+	mm_nr_pmds_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
-- 
cgit v1.2.3


From 3eba0c6a56c04f2b017b43641a821f1ebfb7fb4c Mon Sep 17 00:00:00 2001
From: Ganesh Mahendran <opensource.ganesh@gmail.com>
Date: Thu, 12 Feb 2015 15:00:51 -0800
Subject: mm/zpool: add name argument to create zpool

Currently the underlay of zpool: zsmalloc/zbud, do not know who creates
them.  There is not a method to let zsmalloc/zbud find which caller they
belong to.

Now we want to add statistics collection in zsmalloc.  We need to name the
debugfs dir for each pool created.  The way suggested by Minchan Kim is to
use a name passed by caller(such as zram) to create the zsmalloc pool.

    /sys/kernel/debug/zsmalloc/zram0

This patch adds an argument `name' to zs_create_pool() and other related
functions.

Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Seth Jennings <sjennings@variantweb.net>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 8 +++++---
 include/linux/zpool.h         | 5 +++--
 include/linux/zsmalloc.h      | 2 +-
 mm/zbud.c                     | 3 ++-
 mm/zpool.c                    | 6 ++++--
 mm/zsmalloc.c                 | 6 +++---
 mm/zswap.c                    | 5 +++--
 7 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index eca4b67274c1..8e233edd7a09 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -327,9 +327,10 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
 	kfree(meta);
 }
 
-static struct zram_meta *zram_meta_alloc(u64 disksize)
+static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
 {
 	size_t num_pages;
+	char pool_name[8];
 	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
 
 	if (!meta)
@@ -342,7 +343,8 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
 		goto out_error;
 	}
 
-	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+	snprintf(pool_name, sizeof(pool_name), "zram%d", device_id);
+	meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
 	if (!meta->mem_pool) {
 		pr_err("Error creating memory pool\n");
 		goto out_error;
@@ -783,7 +785,7 @@ static ssize_t disksize_store(struct device *dev,
 		return -EINVAL;
 
 	disksize = PAGE_ALIGN(disksize);
-	meta = zram_meta_alloc(disksize);
+	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
 	if (!meta)
 		return -ENOMEM;
 
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index f14bd75f08b3..56529b34dc63 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -36,7 +36,8 @@ enum zpool_mapmode {
 	ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
 };
 
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops);
+struct zpool *zpool_create_pool(char *type, char *name,
+			gfp_t gfp, struct zpool_ops *ops);
 
 char *zpool_get_type(struct zpool *pool);
 
@@ -80,7 +81,7 @@ struct zpool_driver {
 	atomic_t refcount;
 	struct list_head list;
 
-	void *(*create)(gfp_t gfp, struct zpool_ops *ops);
+	void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops);
 	void (*destroy)(void *pool);
 
 	int (*malloc)(void *pool, size_t size, gfp_t gfp,
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 05c214760977..3283c6a55425 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -36,7 +36,7 @@ enum zs_mapmode {
 
 struct zs_pool;
 
-struct zs_pool *zs_create_pool(gfp_t flags);
+struct zs_pool *zs_create_pool(char *name, gfp_t flags);
 void zs_destroy_pool(struct zs_pool *pool);
 
 unsigned long zs_malloc(struct zs_pool *pool, size_t size);
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387bea702e..2ee4e4520493 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
 	.evict =	zbud_zpool_evict
 };
 
-static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zbud_zpool_create(char *name, gfp_t gfp,
+			struct zpool_ops *zpool_ops)
 {
 	return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
 }
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
 /**
  * zpool_create_pool() - Create a new zpool
  * @type	The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name	The name of the zpool (e.g. zram0, zswap)
  * @gfp		The GFP flags to use when allocating the pool.
  * @ops		The optional ops callback.
  *
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
  *
  * Returns: New zpool on success, NULL on failure.
  */
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
+struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+		struct zpool_ops *ops)
 {
 	struct zpool_driver *driver;
 	struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
 
 	zpool->type = driver->type;
 	zpool->driver = driver;
-	zpool->pool = driver->create(gfp, ops);
+	zpool->pool = driver->create(name, gfp, ops);
 	zpool->ops = ops;
 
 	if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b72403927aa4..2359e61b02bf 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -246,9 +246,9 @@ struct mapping_area {
 
 #ifdef CONFIG_ZPOOL
 
-static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
 {
-	return zs_create_pool(gfp);
+	return zs_create_pool(name, gfp);
 }
 
 static void zs_zpool_destroy(void *pool)
@@ -1148,7 +1148,7 @@ EXPORT_SYMBOL_GPL(zs_free);
  * On success, a pointer to the newly created pool is returned,
  * otherwise NULL.
  */
-struct zs_pool *zs_create_pool(gfp_t flags)
+struct zs_pool *zs_create_pool(char *name, gfp_t flags)
 {
 	int i;
 	struct zs_pool *pool;
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
 
 	pr_info("loading zswap\n");
 
-	zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+	zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+					&zswap_zpool_ops);
 	if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
 		pr_info("%s zpool not available\n", zswap_zpool_type);
 		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-		zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+		zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
 					&zswap_zpool_ops);
 	}
 	if (!zswap_pool) {
-- 
cgit v1.2.3


From 695f055936938c674473ea071ca7359a863551e7 Mon Sep 17 00:00:00 2001
From: Petr Cermak <petrcermak@chromium.org>
Date: Thu, 12 Feb 2015 15:01:00 -0800
Subject: fs/proc/task_mmu.c: add user-space support for resetting
 mm->hiwater_rss (peak RSS)

Peak resident size of a process can be reset back to the process's
current rss value by writing "5" to /proc/pid/clear_refs.  The driving
use-case for this would be getting the peak RSS value, which can be
retrieved from the VmHWM field in /proc/pid/status, per benchmark
iteration or test scenario.

[akpm@linux-foundation.org: clarify behaviour in documentation]
Signed-off-by: Petr Cermak <petrcermak@chromium.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Primiano Tucci <primiano@chromium.org>
Cc: Petr Cermak <petrcermak@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  4 ++++
 fs/proc/task_mmu.c                 | 14 ++++++++++++++
 include/linux/mm.h                 |  5 +++++
 3 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index cf8fc2f0b34b..0b3448dba9ec 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -489,6 +489,10 @@ To clear the bits for the file mapped pages associated with the process
 To clear the soft-dirty bit
     > echo 4 > /proc/PID/clear_refs
 
+To reset the peak resident set size ("high water mark") to the process's
+current value:
+    > echo 5 > /proc/PID/clear_refs
+
 Any other value written to /proc/PID/clear_refs will have no effect.
 
 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0e36c1e49fe3..1359a911d194 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -732,6 +732,7 @@ enum clear_refs_types {
 	CLEAR_REFS_ANON,
 	CLEAR_REFS_MAPPED,
 	CLEAR_REFS_SOFT_DIRTY,
+	CLEAR_REFS_MM_HIWATER_RSS,
 	CLEAR_REFS_LAST,
 };
 
@@ -907,6 +908,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			.mm = mm,
 			.private = &cp,
 		};
+
+		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+			/*
+			 * Writing 5 to /proc/pid/clear_refs resets the peak
+			 * resident set size to this mm's current rss value.
+			 */
+			down_write(&mm->mmap_sem);
+			reset_mm_hiwater_rss(mm);
+			up_write(&mm->mmap_sem);
+			goto out_mm;
+		}
+
 		down_read(&mm->mmap_sem);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -928,6 +941,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			mmu_notifier_invalidate_range_end(mm, 0, -1);
 		flush_tlb_mm(mm);
 		up_read(&mm->mmap_sem);
+out_mm:
 		mmput(mm);
 	}
 	put_task_struct(task);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd52e2f14027..9bee7ec0c31f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1408,6 +1408,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm)
 		mm->hiwater_vm = mm->total_vm;
 }
 
+static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
+{
+	mm->hiwater_rss = get_mm_rss(mm);
+}
+
 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 					 struct mm_struct *mm)
 {
-- 
cgit v1.2.3


From f56141e3e2d9aabf7e6b89680ab572c2cdbb2a24 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 12 Feb 2015 15:01:14 -0800
Subject: all arches, signal: move restart_block to struct task_struct

If an attacker can cause a controlled kernel stack overflow, overwriting
the restart block is a very juicy exploit target.  This is because the
restart_block is held in the same memory allocation as the kernel stack.

Moving the restart block to struct task_struct prevents this exploit by
making the restart_block harder to locate.

Note that there are other fields in thread_info that are also easy
targets, at least on some architectures.

It's also a decent simplification, since the restart code is more or less
identical on all architectures.

[james.hogan@imgtec.com: metag: align thread_info::supervisor_stack]
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: David Miller <davem@davemloft.net>
Acked-by: Richard Weinberger <richard@nod.at>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Tested-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Chris Zankel <chris@zankel.net>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/thread_info.h      |  5 -----
 arch/alpha/kernel/signal.c                |  2 +-
 arch/arc/include/asm/thread_info.h        |  4 ----
 arch/arc/kernel/signal.c                  |  2 +-
 arch/arm/include/asm/thread_info.h        |  4 ----
 arch/arm/kernel/signal.c                  |  4 ++--
 arch/arm64/include/asm/thread_info.h      |  4 ----
 arch/arm64/kernel/signal.c                |  2 +-
 arch/arm64/kernel/signal32.c              |  4 ++--
 arch/avr32/include/asm/thread_info.h      |  4 ----
 arch/avr32/kernel/asm-offsets.c           |  1 -
 arch/avr32/kernel/signal.c                |  2 +-
 arch/blackfin/include/asm/thread_info.h   |  4 ----
 arch/blackfin/kernel/signal.c             |  2 +-
 arch/c6x/include/asm/thread_info.h        |  4 ----
 arch/c6x/kernel/signal.c                  |  2 +-
 arch/cris/arch-v10/kernel/signal.c        |  2 +-
 arch/cris/arch-v32/kernel/signal.c        |  2 +-
 arch/cris/include/asm/thread_info.h       |  4 ----
 arch/frv/include/asm/thread_info.h        |  4 ----
 arch/frv/kernel/asm-offsets.c             |  1 -
 arch/frv/kernel/signal.c                  |  2 +-
 arch/hexagon/include/asm/thread_info.h    |  4 ----
 arch/hexagon/kernel/signal.c              |  2 +-
 arch/ia64/include/asm/thread_info.h       |  4 ----
 arch/ia64/kernel/signal.c                 |  2 +-
 arch/m32r/include/asm/thread_info.h       |  5 -----
 arch/m32r/kernel/signal.c                 |  2 +-
 arch/m68k/include/asm/thread_info.h       |  4 ----
 arch/m68k/kernel/signal.c                 |  4 ++--
 arch/metag/include/asm/thread_info.h      |  6 +-----
 arch/metag/kernel/signal.c                |  2 +-
 arch/microblaze/include/asm/thread_info.h |  4 ----
 arch/microblaze/kernel/signal.c           |  2 +-
 arch/mips/include/asm/thread_info.h       |  4 ----
 arch/mips/kernel/asm-offsets.c            |  1 -
 arch/mips/kernel/signal.c                 |  2 +-
 arch/mips/kernel/signal32.c               |  2 +-
 arch/mn10300/include/asm/thread_info.h    |  4 ----
 arch/mn10300/kernel/asm-offsets.c         |  1 -
 arch/mn10300/kernel/signal.c              |  2 +-
 arch/openrisc/include/asm/thread_info.h   |  4 ----
 arch/openrisc/kernel/signal.c             |  2 +-
 arch/parisc/include/asm/thread_info.h     |  4 ----
 arch/parisc/kernel/signal.c               |  2 +-
 arch/powerpc/include/asm/thread_info.h    |  4 ----
 arch/powerpc/kernel/signal_32.c           |  4 ++--
 arch/powerpc/kernel/signal_64.c           |  2 +-
 arch/s390/include/asm/thread_info.h       |  4 ----
 arch/s390/kernel/compat_signal.c          |  2 +-
 arch/s390/kernel/signal.c                 |  2 +-
 arch/score/include/asm/thread_info.h      |  4 ----
 arch/score/kernel/asm-offsets.c           |  1 -
 arch/score/kernel/signal.c                |  2 +-
 arch/sh/include/asm/thread_info.h         |  4 ----
 arch/sh/kernel/asm-offsets.c              |  1 -
 arch/sh/kernel/signal_32.c                |  4 ++--
 arch/sh/kernel/signal_64.c                |  4 ++--
 arch/sparc/include/asm/thread_info_32.h   |  6 ------
 arch/sparc/include/asm/thread_info_64.h   | 12 +++---------
 arch/sparc/kernel/signal32.c              |  4 ++--
 arch/sparc/kernel/signal_32.c             |  2 +-
 arch/sparc/kernel/signal_64.c             |  2 +-
 arch/sparc/kernel/traps_64.c              |  2 --
 arch/tile/include/asm/thread_info.h       |  4 ----
 arch/tile/kernel/signal.c                 |  2 +-
 arch/um/include/asm/thread_info.h         |  4 ----
 arch/unicore32/include/asm/thread_info.h  |  4 ----
 arch/unicore32/kernel/signal.c            |  2 +-
 arch/x86/ia32/ia32_signal.c               |  2 +-
 arch/x86/include/asm/thread_info.h        |  4 ----
 arch/x86/kernel/signal.c                  |  2 +-
 arch/x86/um/signal.c                      |  2 +-
 arch/xtensa/include/asm/thread_info.h     |  5 -----
 arch/xtensa/kernel/signal.c               |  2 +-
 fs/select.c                               |  2 +-
 include/linux/init_task.h                 |  3 +++
 include/linux/sched.h                     |  2 ++
 kernel/compat.c                           |  5 ++---
 kernel/futex.c                            |  2 +-
 kernel/signal.c                           |  2 +-
 kernel/time/alarmtimer.c                  |  2 +-
 kernel/time/hrtimer.c                     |  2 +-
 kernel/time/posix-cpu-timers.c            |  3 +--
 84 files changed, 62 insertions(+), 194 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 48bbea6898b3..d5b98ab514bb 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -27,8 +27,6 @@ struct thread_info {
 	int bpt_nsaved;
 	unsigned long bpt_addr[2];		/* breakpoint handling  */
 	unsigned int bpt_insn[2];
-
-	struct restart_block	restart_block;
 };
 
 /*
@@ -40,9 +38,6 @@ struct thread_info {
 	.exec_domain	= &default_exec_domain,	\
 	.addr_limit	= KERNEL_DS,		\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 6cec2881acbf..8dbfb15f1745 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
 	struct switch_stack *sw = (struct switch_stack *)regs - 1;
 	long i, err = __get_user(regs->pc, &sc->sc_pc);
 
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	sw->r26 = (unsigned long) ret_from_sys_call;
 
diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h
index 02bc5ec0fb2e..1163a1838ac1 100644
--- a/arch/arc/include/asm/thread_info.h
+++ b/arch/arc/include/asm/thread_info.h
@@ -46,7 +46,6 @@ struct thread_info {
 	struct exec_domain *exec_domain;/* execution domain */
 	__u32 cpu;			/* current CPU */
 	unsigned long thr_ptr;		/* TLS ptr */
-	struct restart_block restart_block;
 };
 
 /*
@@ -62,9 +61,6 @@ struct thread_info {
 	.cpu        = 0,			\
 	.preempt_count  = INIT_PREEMPT_COUNT,	\
 	.addr_limit = KERNEL_DS,		\
-	.restart_block  = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info    (init_thread_union.thread_info)
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index cb3142a2d40b..114234e83caa 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	struct pt_regs *regs = current_pt_regs();
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/* Since we stacked the signal on a word boundary,
 	 * then 'sp' should be word aligned here.  If it's
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index d890e41f5520..72812a1f3d1c 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -68,7 +68,6 @@ struct thread_info {
 #ifdef CONFIG_ARM_THUMBEE
 	unsigned long		thumbee_state;	/* ThumbEE Handler Base register */
 #endif
-	struct restart_block	restart_block;
 };
 
 #define INIT_THREAD_INFO(tsk)						\
@@ -81,9 +80,6 @@ struct thread_info {
 	.cpu_domain	= domain_val(DOMAIN_USER, DOMAIN_MANAGER) |	\
 			  domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) |	\
 			  domain_val(DOMAIN_IO, DOMAIN_CLIENT),		\
-	.restart_block	= {						\
-		.fn	= do_no_restart_syscall,			\
-	},								\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 8aa6f1b87c9e..023ac905e4c3 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs)
 	struct sigframe __user *frame;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a 64-bit boundary,
@@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
 	struct rt_sigframe __user *frame;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 459bf8e53208..702e1e6a0d80 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -48,7 +48,6 @@ struct thread_info {
 	mm_segment_t		addr_limit;	/* address limit */
 	struct task_struct	*task;		/* main task structure */
 	struct exec_domain	*exec_domain;	/* execution domain */
-	struct restart_block	restart_block;
 	int			preempt_count;	/* 0 => preemptable, <0 => bug */
 	int			cpu;		/* cpu */
 };
@@ -60,9 +59,6 @@ struct thread_info {
 	.flags		= 0,						\
 	.preempt_count	= INIT_PREEMPT_COUNT,				\
 	.addr_limit	= KERNEL_DS,					\
-	.restart_block	= {						\
-		.fn	= do_no_restart_syscall,			\
-	},								\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 6fa792137eda..660ccf9f7524 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	struct rt_sigframe __user *frame;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a 128-bit boundary, then 'sp' should
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index e299de396e9b..c20a300e2213 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -347,7 +347,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs)
 	struct compat_sigframe __user *frame;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a 64-bit boundary,
@@ -381,7 +381,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs)
 	struct compat_rt_sigframe __user *frame;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
index a978f3fe7c25..d56afa99a514 100644
--- a/arch/avr32/include/asm/thread_info.h
+++ b/arch/avr32/include/asm/thread_info.h
@@ -30,7 +30,6 @@ struct thread_info {
 						   saved by debug handler
 						   when setting up
 						   trampoline */
-	struct restart_block	restart_block;
 	__u8			supervisor_stack[0];
 };
 
@@ -41,9 +40,6 @@ struct thread_info {
 	.flags		= 0,						\
 	.cpu		= 0,						\
 	.preempt_count	= INIT_PREEMPT_COUNT,				\
-	.restart_block	= {						\
-		.fn	= do_no_restart_syscall				\
-	}								\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c
index d6a8193a1d2f..e41c84516e5d 100644
--- a/arch/avr32/kernel/asm-offsets.c
+++ b/arch/avr32/kernel/asm-offsets.c
@@ -18,7 +18,6 @@ void foo(void)
 	OFFSET(TI_preempt_count, thread_info, preempt_count);
 	OFFSET(TI_rar_saved, thread_info, rar_saved);
 	OFFSET(TI_rsr_saved, thread_info, rsr_saved);
-	OFFSET(TI_restart_block, thread_info, restart_block);
 	BLANK();
 	OFFSET(TSK_active_mm, task_struct, active_mm);
 	BLANK();
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index d309fbcc3bd6..8f1c63b9b983 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
 	sigset_t set;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	frame = (struct rt_sigframe __user *)regs->sp;
 	pr_debug("SIG return: frame = %p\n", frame);
diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h
index 55f473bdad36..57c3a8bd583d 100644
--- a/arch/blackfin/include/asm/thread_info.h
+++ b/arch/blackfin/include/asm/thread_info.h
@@ -42,7 +42,6 @@ struct thread_info {
 	int cpu;		/* cpu we're on */
 	int preempt_count;	/* 0 => preemptable, <0 => BUG */
 	mm_segment_t addr_limit;	/* address limit */
-	struct restart_block restart_block;
 #ifndef CONFIG_SMP
 	struct l1_scratch_task_info l1_task_info;
 #endif
@@ -58,9 +57,6 @@ struct thread_info {
 	.flags		= 0,			\
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index ef275571d885..f2a8b5493bd3 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p
 	int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 #define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x)
 
diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h
index d4e9ef87076d..584e253f3217 100644
--- a/arch/c6x/include/asm/thread_info.h
+++ b/arch/c6x/include/asm/thread_info.h
@@ -45,7 +45,6 @@ struct thread_info {
 	int			cpu;		/* cpu we're on */
 	int			preempt_count;	/* 0 = preemptable, <0 = BUG */
 	mm_segment_t		addr_limit;	/* thread address space */
-	struct restart_block	restart_block;
 };
 
 /*
@@ -61,9 +60,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index fe68226f6c4d..3c4bb5a5c382 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs)
 	sigset_t set;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a dword boundary,
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 9b32d338838b..74d7ba35120d 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 	unsigned long old_usp;
 
         /* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/* restore the regs from &sc->regs (same as sc, since regs is first)
 	 * (sc is already checked for VERIFY_READ since the sigframe was
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index 78ce3b1c9bcb..870e3e069318 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 	unsigned long old_usp;
 
         /* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Restore the registers from &sc->regs. sc is already checked
diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h
index 55dede18c032..7286db5ed90e 100644
--- a/arch/cris/include/asm/thread_info.h
+++ b/arch/cris/include/asm/thread_info.h
@@ -38,7 +38,6 @@ struct thread_info {
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
-	struct restart_block    restart_block;
 	__u8			supervisor_stack[0];
 };
 
@@ -56,9 +55,6 @@ struct thread_info {
 	.cpu		= 0,				\
 	.preempt_count	= INIT_PREEMPT_COUNT,		\
 	.addr_limit	= KERNEL_DS,			\
-	.restart_block = {				\
-		       .fn = do_no_restart_syscall,	\
-	},						\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index af29e17c0181..6b917f1c2955 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -41,7 +41,6 @@ struct thread_info {
 						 * 0-0xBFFFFFFF for user-thead
 						 * 0-0xFFFFFFFF for kernel-thread
 						 */
-	struct restart_block    restart_block;
 
 	__u8			supervisor_stack[0];
 };
@@ -65,9 +64,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c
index 9de96843a278..446e89d500cc 100644
--- a/arch/frv/kernel/asm-offsets.c
+++ b/arch/frv/kernel/asm-offsets.c
@@ -40,7 +40,6 @@ void foo(void)
 	OFFSET(TI_CPU,			thread_info, cpu);
 	OFFSET(TI_PREEMPT_COUNT,	thread_info, preempt_count);
 	OFFSET(TI_ADDR_LIMIT,		thread_info, addr_limit);
-	OFFSET(TI_RESTART_BLOCK,	thread_info, restart_block);
 	BLANK();
 
 	/* offsets into register file storage */
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index dc3d59de0870..336713ab4745 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8)
 	unsigned long tbr, psr;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	tbr = user->i.tbr;
 	psr = user->i.psr;
diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h
index a59dad3b3695..bacd3d6030c5 100644
--- a/arch/hexagon/include/asm/thread_info.h
+++ b/arch/hexagon/include/asm/thread_info.h
@@ -56,7 +56,6 @@ struct thread_info {
 	 * used for syscalls somehow;
 	 * seems to have a function pointer and four arguments
 	 */
-	struct restart_block    restart_block;
 	/* Points to the current pt_regs frame  */
 	struct pt_regs		*regs;
 	/*
@@ -83,9 +82,6 @@ struct thread_info {
 	.cpu            = 0,                    \
 	.preempt_count  = 1,                    \
 	.addr_limit     = KERNEL_DS,            \
-	.restart_block = {                      \
-		.fn = do_no_restart_syscall,    \
-	},                                      \
 	.sp = 0,				\
 	.regs = NULL,			\
 }
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index eadd70e47e7e..b039a624c170 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	sigset_t blocked;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	frame = (struct rt_sigframe __user *)pt_psp(regs);
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 5b17418b4223..c16f21a068ff 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -27,7 +27,6 @@ struct thread_info {
 	__u32 status;			/* Thread synchronous flags */
 	mm_segment_t addr_limit;	/* user-level address space limit */
 	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
-	struct restart_block restart_block;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	__u64 ac_stamp;
 	__u64 ac_leave;
@@ -46,9 +45,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.addr_limit	= KERNEL_DS,		\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #ifndef ASM_OFFSETS_C
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 6d92170be457..b3a124da71e5 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
 	long err;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/* restore scratch that always needs gets updated during signal delivery: */
 	err  = __get_user(flags, &sc->sc_flags);
diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h
index 00171703402f..32422d0211c3 100644
--- a/arch/m32r/include/asm/thread_info.h
+++ b/arch/m32r/include/asm/thread_info.h
@@ -34,7 +34,6 @@ struct thread_info {
 					 	   0-0xBFFFFFFF for user-thread
 						   0-0xFFFFFFFF for kernel-thread
 						*/
-	struct restart_block    restart_block;
 
 	__u8			supervisor_stack[0];
 };
@@ -49,7 +48,6 @@ struct thread_info {
 #define TI_CPU		0x00000010
 #define TI_PRE_COUNT	0x00000014
 #define TI_ADDR_LIMIT	0x00000018
-#define TI_RESTART_BLOCK 0x000001C
 
 #endif
 
@@ -68,9 +66,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 95408b8f130a..7736c6660a15 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 #define COPY(x)		err |= __get_user(regs->x, &sc->sc_##x)
 	COPY(r4);
diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h
index 21a4784ca5a1..c54256e69e64 100644
--- a/arch/m68k/include/asm/thread_info.h
+++ b/arch/m68k/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
 	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	__u32			cpu;		/* should always be 0 on m68k */
 	unsigned long		tp_value;	/* thread pointer */
-	struct restart_block    restart_block;
 };
 #endif /* __ASSEMBLY__ */
 
@@ -41,9 +40,6 @@ struct thread_info {
 	.exec_domain	= &default_exec_domain,	\
 	.addr_limit	= KERNEL_DS,		\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_stack		(init_thread_union.stack)
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 967a8b7e1527..d7179281e74a 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u
 	int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/* get previous context */
 	if (copy_from_user(&context, usc, sizeof(context)))
@@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw,
 	int err;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	err = __get_user(temp, &uc->uc_mcontext.version);
 	if (temp != MCONTEXT_VERSION)
diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h
index 47711336119e..afb3ca4776d1 100644
--- a/arch/metag/include/asm/thread_info.h
+++ b/arch/metag/include/asm/thread_info.h
@@ -35,9 +35,8 @@ struct thread_info {
 	int preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t addr_limit;	/* thread address space */
-	struct restart_block restart_block;
 
-	u8 supervisor_stack[0];
+	u8 supervisor_stack[0] __aligned(8);
 };
 
 #else /* !__ASSEMBLY__ */
@@ -74,9 +73,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c
index 0d100d5c1407..ce49d429c74a 100644
--- a/arch/metag/kernel/signal.c
+++ b/arch/metag/kernel/signal.c
@@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs,
 	int err;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL,
 				   &sc->regs);
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index 8c9d36591a03..b699fbd7de4a 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -71,7 +71,6 @@ struct thread_info {
 	__u32			cpu; /* current CPU */
 	__s32			preempt_count; /* 0 => preemptable,< 0 => BUG*/
 	mm_segment_t		addr_limit; /* thread address space */
-	struct restart_block	restart_block;
 
 	struct cpu_context	cpu_context;
 };
@@ -87,9 +86,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 235706055b7f..a1cbaf90e2ea 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	int rval;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index e4440f92b366..9e1295f874f0 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -34,7 +34,6 @@ struct thread_info {
 						 * 0x7fffffff for user-thead
 						 * 0xffffffff for kernel-thread
 						 */
-	struct restart_block	restart_block;
 	struct pt_regs		*regs;
 	long			syscall;	/* syscall number */
 };
@@ -50,9 +49,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index b1d84bd4efb3..3b2dfdb4865f 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -98,7 +98,6 @@ void output_thread_info_defines(void)
 	OFFSET(TI_CPU, thread_info, cpu);
 	OFFSET(TI_PRE_COUNT, thread_info, preempt_count);
 	OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
-	OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
 	OFFSET(TI_REGS, thread_info, regs);
 	DEFINE(_THREAD_SIZE, THREAD_SIZE);
 	DEFINE(_THREAD_MASK, THREAD_MASK);
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 545bf11bd2ed..6a28c792d862 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 	int i;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	err |= __get_user(regs->cp0_epc, &sc->sc_pc);
 
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index d69179c0d49d..19a7705f2a01 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs,
 	int i;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	err |= __get_user(regs->cp0_epc, &sc->sc_pc);
 	err |= __get_user(regs->hi, &sc->sc_mdhi);
diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h
index bf280eaccd36..c1c374f0ec12 100644
--- a/arch/mn10300/include/asm/thread_info.h
+++ b/arch/mn10300/include/asm/thread_info.h
@@ -50,7 +50,6 @@ struct thread_info {
 						   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
-	struct restart_block    restart_block;
 
 	__u8			supervisor_stack[0];
 };
@@ -80,9 +79,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c
index 47b3bb0c04ff..d780670cbaf3 100644
--- a/arch/mn10300/kernel/asm-offsets.c
+++ b/arch/mn10300/kernel/asm-offsets.c
@@ -28,7 +28,6 @@ void foo(void)
 	OFFSET(TI_cpu,			thread_info, cpu);
 	OFFSET(TI_preempt_count,	thread_info, preempt_count);
 	OFFSET(TI_addr_limit,		thread_info, addr_limit);
-	OFFSET(TI_restart_block,	thread_info, restart_block);
 	BLANK();
 
 	OFFSET(REG_D0,			pt_regs, d0);
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index a6c0858592c3..8609845f12c5 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs,
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (is_using_fpu(current))
 		fpu_kill_state(current);
diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h
index d797acc901e4..875f0845a707 100644
--- a/arch/openrisc/include/asm/thread_info.h
+++ b/arch/openrisc/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
 					       0-0x7FFFFFFF for user-thead
 					       0-0xFFFFFFFF for kernel-thread
 					     */
-	struct restart_block    restart_block;
 	__u8			supervisor_stack[0];
 
 	/* saved context data */
@@ -79,9 +78,6 @@ struct thread_info {
 	.cpu		= 0,				\
 	.preempt_count	= 1,				\
 	.addr_limit	= KERNEL_DS,			\
-	.restart_block  = {				\
-			  .fn = do_no_restart_syscall,	\
-	},						\
 	.ksp            = 0,                            \
 }
 
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 7d1b8235bf90..4112175bf803 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs,
 	int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Restore the regs from &sc->regs.
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index a84611835549..fb13e3865563 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -14,7 +14,6 @@ struct thread_info {
 	mm_segment_t addr_limit;	/* user-level address space limit */
 	__u32 cpu;			/* current CPU */
 	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
-	struct restart_block restart_block;
 };
 
 #define INIT_THREAD_INFO(tsk)			\
@@ -25,9 +24,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.addr_limit	= KERNEL_DS,		\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
-  	.restart_block	= {			\
-		.fn = do_no_restart_syscall	\
-	}					\
 }
 
 #define init_thread_info        (init_thread_union.thread_info)
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 012d4fa63d97..9b910a0251b8 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall)
 		sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
 #endif
 
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/* Unwind the user stack to get the rt_sigframe structure. */
 	frame = (struct rt_sigframe __user *)
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index e8abc83e699f..72489799cf02 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -43,7 +43,6 @@ struct thread_info {
 	int		cpu;			/* cpu we're on */
 	int		preempt_count;		/* 0 => preemptable,
 						   <0 => BUG */
-	struct restart_block restart_block;
 	unsigned long	local_flags;		/* private flags for thread */
 
 	/* low level flags - has atomic operations done on it */
@@ -59,9 +58,6 @@ struct thread_info {
 	.exec_domain =	&default_exec_domain,	\
 	.cpu =		0,			\
 	.preempt_count = INIT_PREEMPT_COUNT,	\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 	.flags =	0,			\
 }
 
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index b171001698ff..d3a831ac0f92 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1231,7 +1231,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
 	int tm_restore = 0;
 #endif
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	rt_sf = (struct rt_sigframe __user *)
 		(regs->gpr[1] + __SIGNAL_FRAMESIZE + 16);
@@ -1504,7 +1504,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
 #endif
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
 	sc = &sf->sctx;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 2cb0c94cafa5..c7c24d2e2bdb 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
 #endif
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (!access_ok(VERIFY_READ, uc, sizeof(*uc)))
 		goto badframe;
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 4d62fd5b56e5..ef1df718642d 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -39,7 +39,6 @@ struct thread_info {
 	unsigned long		sys_call_table;	/* System call table address */
 	unsigned int		cpu;		/* current CPU */
 	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
-	struct restart_block	restart_block;
 	unsigned int		system_call;
 	__u64			user_timer;
 	__u64			system_timer;
@@ -56,9 +55,6 @@ struct thread_info {
 	.flags		= 0,			\
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 34d5fa7b01b5..bc1df12dd4f8 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -209,7 +209,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
 	int i;
 
 	/* Alwys make any pending restarted system call return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs)))
 		return -EFAULT;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 6a2ac257d98f..b3ae6f70c6d6 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
 	_sigregs user_sregs;
 
 	/* Alwys make any pending restarted system call return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs)))
 		return -EFAULT;
diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h
index 656b7ada9326..33864fa2a8d4 100644
--- a/arch/score/include/asm/thread_info.h
+++ b/arch/score/include/asm/thread_info.h
@@ -42,7 +42,6 @@ struct thread_info {
 	 * 0-0xFFFFFFFF for kernel-thread
 	 */
 	mm_segment_t		addr_limit;
-	struct restart_block	restart_block;
 	struct pt_regs		*regs;
 };
 
@@ -58,9 +57,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= 1,			\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c
index 57788f44c6fb..b4d5214a7a7e 100644
--- a/arch/score/kernel/asm-offsets.c
+++ b/arch/score/kernel/asm-offsets.c
@@ -106,7 +106,6 @@ void output_thread_info_defines(void)
 	OFFSET(TI_CPU, thread_info, cpu);
 	OFFSET(TI_PRE_COUNT, thread_info, preempt_count);
 	OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
-	OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
 	OFFSET(TI_REGS, thread_info, regs);
 	DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE);
 	DEFINE(KERNEL_STACK_MASK, THREAD_MASK);
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index 1651807774ad..e381c8c4ff65 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs)
 	int sig;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	frame = (struct rt_sigframe __user *) regs->regs[0];
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index ad27ffa65e2e..657c03919627 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -33,7 +33,6 @@ struct thread_info {
 	__u32			cpu;
 	int			preempt_count; /* 0 => preemptable, <0 => BUG */
 	mm_segment_t		addr_limit;	/* thread address space */
-	struct restart_block	restart_block;
 	unsigned long		previous_sp;	/* sp of previous stack in case
 						   of nested IRQ stacks */
 	__u8			supervisor_stack[0];
@@ -63,9 +62,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
index 08a2be775b6c..542225fedb11 100644
--- a/arch/sh/kernel/asm-offsets.c
+++ b/arch/sh/kernel/asm-offsets.c
@@ -25,7 +25,6 @@ int main(void)
 	DEFINE(TI_FLAGS,	offsetof(struct thread_info, flags));
 	DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
 	DEFINE(TI_PRE_COUNT,	offsetof(struct thread_info, preempt_count));
-	DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block));
 	DEFINE(TI_SIZE,		sizeof(struct thread_info));
 
 #ifdef CONFIG_HIBERNATION
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 2f002b24fb92..0b34f2a704fe 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void)
 	int r0;
 
         /* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
@@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	int r0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 897abe7b871e..71993c6a7d94 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3,
 	long long ret;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
@@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3,
 	long long ret;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h
index 025c98446b1e..fd7bd0a440ca 100644
--- a/arch/sparc/include/asm/thread_info_32.h
+++ b/arch/sparc/include/asm/thread_info_32.h
@@ -47,8 +47,6 @@ struct thread_info {
 	struct reg_window32	reg_window[NSWINS];	/* align for ldd! */
 	unsigned long		rwbuf_stkptrs[NSWINS];
 	unsigned long		w_saved;
-
-	struct restart_block	restart_block;
 };
 
 /*
@@ -62,9 +60,6 @@ struct thread_info {
 	.flags		=	0,			\
 	.cpu		=	0,			\
 	.preempt_count	=	INIT_PREEMPT_COUNT,	\
-	.restart_block	= {				\
-		.fn	=	do_no_restart_syscall,	\
-	},						\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
@@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define TI_REG_WINDOW	0x30
 #define TI_RWIN_SPTRS	0x230
 #define TI_W_SAVED	0x250
-/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */
 
 /*
  * thread information flag bit numbers
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 798f0279a4b5..ff455164732a 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -58,8 +58,6 @@ struct thread_info {
 	unsigned long		gsr[7];
 	unsigned long		xfsr[7];
 
-	struct restart_block	restart_block;
-
 	struct pt_regs		*kern_una_regs;
 	unsigned int		kern_una_insn;
 
@@ -92,10 +90,9 @@ struct thread_info {
 #define TI_RWIN_SPTRS	0x000003c8
 #define TI_GSR		0x00000400
 #define TI_XFSR		0x00000438
-#define TI_RESTART_BLOCK 0x00000470
-#define TI_KUNA_REGS	0x000004a0
-#define TI_KUNA_INSN	0x000004a8
-#define TI_FPREGS	0x000004c0
+#define TI_KUNA_REGS	0x00000470
+#define TI_KUNA_INSN	0x00000478
+#define TI_FPREGS	0x00000480
 
 /* We embed this in the uppermost byte of thread_info->flags */
 #define FAULT_CODE_WRITE	0x01	/* Write access, implies D-TLB	   */
@@ -124,9 +121,6 @@ struct thread_info {
 	.current_ds	=	ASI_P,			\
 	.exec_domain	=	&default_exec_domain,	\
 	.preempt_count	=	INIT_PREEMPT_COUNT,	\
-	.restart_block	= {				\
-		.fn	=	do_no_restart_syscall,	\
-	},						\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 62deba7be1a9..4eed773a7735 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -150,7 +150,7 @@ void do_sigreturn32(struct pt_regs *regs)
 	int err, i;
 	
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	synchronize_user_stack();
 
@@ -235,7 +235,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
 	int err, i;
 	
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	synchronize_user_stack();
 	regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 9ee72fc8e0e4..52aa5e4ce5e7 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -70,7 +70,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs)
 	int err;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	synchronize_user_stack();
 
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 1a6999868031..d88beff47bab 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -254,7 +254,7 @@ void do_rt_sigreturn(struct pt_regs *regs)
 	int err;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	synchronize_user_stack ();
 	sf = (struct rt_signal_frame __user *)
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 981a769b9558..a27651e866e7 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2730,8 +2730,6 @@ void __init trap_init(void)
 		     TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
 		     TI_CURRENT_DS != offsetof(struct thread_info,
 						current_ds) ||
-		     TI_RESTART_BLOCK != offsetof(struct thread_info,
-						  restart_block) ||
 		     TI_KUNA_REGS != offsetof(struct thread_info,
 					      kern_una_regs) ||
 		     TI_KUNA_INSN != offsetof(struct thread_info,
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 48e4fd0f38e4..96c14c1430d8 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -36,7 +36,6 @@ struct thread_info {
 
 	mm_segment_t		addr_limit;	/* thread address space
 						   (KERNEL_DS or USER_DS) */
-	struct restart_block	restart_block;
 	struct single_step_state *step_state;	/* single step state
 						   (if non-zero) */
 	int			align_ctl;	/* controls unaligned access */
@@ -57,9 +56,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 	.step_state	= NULL,			\
 	.align_ctl	= 0,			\
 }
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index bb0a9ce7ae23..8a524e332c1a 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -48,7 +48,7 @@ int restore_sigcontext(struct pt_regs *regs,
 	int err;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Enforce that sigcontext is like pt_regs, and doesn't mess
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index 1c5b2a83046a..e04114c4fcd9 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -22,7 +22,6 @@ struct thread_info {
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user
 						   0-0xFFFFFFFF for kernel */
-	struct restart_block    restart_block;
 	struct thread_info	*real_thread;    /* Points to non-IRQ stack */
 };
 
@@ -34,9 +33,6 @@ struct thread_info {
 	.cpu =		0,			\
 	.preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit =	KERNEL_DS,		\
-	.restart_block =  {			\
-		.fn =  do_no_restart_syscall,	\
-	},					\
 	.real_thread = NULL,			\
 }
 
diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h
index af36d8eabdf1..63e2839dfeb8 100644
--- a/arch/unicore32/include/asm/thread_info.h
+++ b/arch/unicore32/include/asm/thread_info.h
@@ -79,7 +79,6 @@ struct thread_info {
 #ifdef CONFIG_UNICORE_FPU_F64
 	struct fp_state		fpstate __attribute__((aligned(8)));
 #endif
-	struct restart_block	restart_block;
 };
 
 #define INIT_THREAD_INFO(tsk)						\
@@ -89,9 +88,6 @@ struct thread_info {
 	.flags		= 0,						\
 	.preempt_count	= INIT_PREEMPT_COUNT,				\
 	.addr_limit	= KERNEL_DS,					\
-	.restart_block	= {						\
-		.fn	= do_no_restart_syscall,			\
-	},								\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 7c8fb7018dc6..d329f85766cc 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
 	struct rt_sigframe __user *frame;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/*
 	 * Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f9e181aaba97..d0165c9a2932 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 	u32 tmp;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	get_user_try {
 		/*
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e82e95abc92b..1d4e4f279a32 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
 	__u32			cpu;		/* current CPU */
 	int			saved_preempt_count;
 	mm_segment_t		addr_limit;
-	struct restart_block    restart_block;
 	void __user		*sysenter_return;
 	unsigned int		sig_on_uaccess_error:1;
 	unsigned int		uaccess_err:1;	/* uaccess failed */
@@ -45,9 +44,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2a33c8f68319..e5042463c1bc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	get_user_try {
 
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 79d824551c1a..0c8c32bfd792 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
 	int err, pid;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	err = copy_from_user(&sc, from, sizeof(sc));
 	if (err)
diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h
index 470153e8547c..a9b5d3ba196c 100644
--- a/arch/xtensa/include/asm/thread_info.h
+++ b/arch/xtensa/include/asm/thread_info.h
@@ -51,7 +51,6 @@ struct thread_info {
 	__s32			preempt_count;	/* 0 => preemptable,< 0 => BUG*/
 
 	mm_segment_t		addr_limit;	/* thread address space */
-	struct restart_block    restart_block;
 
 	unsigned long		cpenable;
 
@@ -72,7 +71,6 @@ struct thread_info {
 #define TI_CPU		 0x00000010
 #define TI_PRE_COUNT	 0x00000014
 #define TI_ADDR_LIMIT	 0x00000018
-#define TI_RESTART_BLOCK 0x000001C
 
 #endif
 
@@ -90,9 +88,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 4612321c73cc..3d733ba16f28 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
 	int ret;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	if (regs->depc > 64)
 		panic("rt_sigreturn in double exception!\n");
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
 
-		restart_block = &current_thread_info()->restart_block;
+		restart_block = &current->restart_block;
 		restart_block->fn = do_restart_poll;
 		restart_block->poll.ufds = ufds;
 		restart_block->poll.nfds = nfds;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3037fc085e8e..d3d43ecf148c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -193,6 +193,9 @@ extern struct task_group root_task_group;
 	.nr_cpus_allowed= NR_CPUS,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
+	.restart_block = {						\
+		.fn = do_no_restart_syscall,				\
+	},								\
 	.se		= {						\
 		.group_node 	= LIST_HEAD_INIT(tsk.se.group_node),	\
 	},								\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef98d2f..22ee0d5d7f8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1370,6 +1370,8 @@ struct task_struct {
 
 	unsigned long atomic_flags; /* Flags needing atomic access. */
 
+	struct restart_block restart_block;
+
 	pid_t pid;
 	pid_t tgid;
 
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 	 * core implementation decides to return random nonsense.
 	 */
 	if (ret == -ERESTART_RESTARTBLOCK) {
-		struct restart_block *restart
-			= &current_thread_info()->restart_block;
+		struct restart_block *restart = &current->restart_block;
 
 		restart->fn = compat_nanosleep_restart;
 		restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		return -EFAULT;
 
 	if (err == -ERESTART_RESTARTBLOCK) {
-		restart = &current_thread_info()->restart_block;
+		restart = &current->restart_block;
 		restart->fn = compat_clock_nanosleep_restart;
 		restart->nanosleep.compat_rmtp = rmtp;
 	}
diff --git a/kernel/futex.c b/kernel/futex.c
index 4eeb63de7e54..2a5e3830e953 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2217,7 +2217,7 @@ retry:
 	if (!abs_time)
 		goto out;
 
-	restart = &current_thread_info()->restart_block;
+	restart = &current->restart_block;
 	restart->fn = futex_wait_restart;
 	restart->futex.uaddr = uaddr;
 	restart->futex.val = val;
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..33a52759cc0e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
  */
 SYSCALL_DEFINE0(restart_syscall)
 {
-	struct restart_block *restart = &current_thread_info()->restart_block;
+	struct restart_block *restart = &current->restart_block;
 	return restart->fn(restart);
 }
 
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 			goto out;
 	}
 
-	restart = &current_thread_info()->restart_block;
+	restart = &current->restart_block;
 	restart->fn = alarm_timer_nsleep_restart;
 	restart->nanosleep.clockid = type;
 	restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3f5e183c3d97..bee0c1f78091 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1583,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 			goto out;
 	}
 
-	restart = &current_thread_info()->restart_block;
+	restart = &current->restart_block;
 	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
 	restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 			    struct timespec *rqtp, struct timespec __user *rmtp)
 {
-	struct restart_block *restart_block =
-		&current_thread_info()->restart_block;
+	struct restart_block *restart_block = &current->restart_block;
 	struct itimerspec it;
 	int error;
 
-- 
cgit v1.2.3


From dd4a5c1e6531b9c6ba6ee1f3cb11f0aeea25881e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 12 Feb 2015 15:01:22 -0800
Subject: linux/types.h: Always use unsigned long for pgoff_t

Everybody uses unsigned long for pgoff_t, and no one ever overrode the
definition of pgoff_t.  Keep it that way, and remove the option of
overriding it.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/types.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/types.h b/include/linux/types.h
index 62323825cff9..6747247e3f9f 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -135,12 +135,9 @@ typedef unsigned long blkcnt_t;
 #endif
 
 /*
- * The type of an index into the pagecache.  Use a #define so asm/types.h
- * can override it.
+ * The type of an index into the pagecache.
  */
-#ifndef pgoff_t
 #define pgoff_t unsigned long
-#endif
 
 /* A dma_addr_t can hold any valid DMA or bus address for the platform */
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
-- 
cgit v1.2.3


From 545a2bf742fb41f17d03486dd8a8c74ad511dec2 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Thu, 12 Feb 2015 15:01:24 -0800
Subject: kernel/sched/clock.c: add another clock for use with the soft lockup
 watchdog

When the hypervisor pauses a virtualised kernel the kernel will observe a
jump in timebase, this can cause spurious messages from the softlockup
detector.

Whilst these messages are harmless, they are accompanied with a stack
trace which causes undue concern and more problematically the stack trace
in the guest has nothing to do with the observed problem and can only be
misleading.

Futhermore, on POWER8 this is completely avoidable with the introduction
of the Virtual Time Base (VTB) register.

This patch (of 2):

This permits the use of arch specific clocks for which virtualised kernels
can use their notion of 'running' time, not the elpased wall time which
will include host execution time.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Andrew Jones <drjones@redhat.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: chai wen <chaiw.fnst@cn.fujitsu.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Aaron Tomlin <atomlin@redhat.com>
Cc: Ben Zhang <benzh@chromium.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 +
 kernel/sched/clock.c  | 13 +++++++++++++
 kernel/watchdog.c     |  2 +-
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 22ee0d5d7f8c..048b91b983ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2147,6 +2147,7 @@ extern unsigned long long notrace sched_clock(void);
  */
 extern u64 cpu_clock(int cpu);
 extern u64 local_clock(void);
+extern u64 running_clock(void);
 extern u64 sched_clock_cpu(int cpu);
 
 
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
 
 EXPORT_SYMBOL_GPL(cpu_clock);
 EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+	return local_clock();
+}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
  */
 static unsigned long get_timestamp(void)
 {
-	return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */
+	return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
 static void set_sample_period(void)
-- 
cgit v1.2.3


From 02f1f2170d2831b3233e91091c60a66622f29e82 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:01:31 -0800
Subject: kernel.h: remove ancient __FUNCTION__ hack

__FUNCTION__ hasn't been treated as a string literal since gcc 3.4, so
this only helps people who only test-compile using 3.3 (compiler-gcc3.h
barks at anything older than that).  Besides, there are almost no
occurrences of __FUNCTION__ left in the tree.

[akpm@linux-foundation.org: convert remaining __FUNCTION__ references]
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/hpet.c                       | 2 +-
 arch/x86/kernel/rtc.c                        | 4 ++--
 arch/x86/platform/intel-mid/intel_mid_vrtc.c | 2 +-
 drivers/acpi/acpica/utdebug.c                | 4 ++--
 drivers/block/xen-blkfront.c                 | 2 +-
 include/acpi/acoutput.h                      | 6 +++---
 include/linux/kernel.h                       | 3 ---
 7 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 319bcb9372fe..3acbff4716b0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line)
 #define hpet_print_config()					\
 do {								\
 	if (hpet_verbose)					\
-		_hpet_print_config(__FUNCTION__, __LINE__);	\
+		_hpet_print_config(__func__, __LINE__);	\
 } while (0)
 
 /*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index fe3dbfe0c4a5..cd9685235df9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now)
 		retval = set_rtc_time(&tm);
 		if (retval)
 			printk(KERN_ERR "%s: RTC write failed with error %d\n",
-			       __FUNCTION__, retval);
+			       __func__, retval);
 	} else {
 		printk(KERN_ERR
 		       "%s: Invalid RTC value: write of %lx to RTC failed\n",
-			__FUNCTION__, nowtime);
+			__func__, nowtime);
 		retval = -EINVAL;
 	}
 	return retval;
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 4762cff7facd..32947ba0f62d 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now)
 		spin_unlock_irqrestore(&rtc_lock, flags);
 	} else {
 		pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
-			__FUNCTION__, now->tv_sec);
+			__func__, now->tv_sec);
 		retval = -EINVAL;
 	}
 	return retval;
diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c
index 57078e3ea9b7..4f3f888d33bb 100644
--- a/drivers/acpi/acpica/utdebug.c
+++ b/drivers/acpi/acpica/utdebug.c
@@ -111,8 +111,8 @@ void acpi_ut_track_stack_ptr(void)
  * RETURN:      Updated pointer to the function name
  *
  * DESCRIPTION: Remove the "Acpi" prefix from the function name, if present.
- *              This allows compiler macros such as __FUNCTION__ to be used
- *              with no change to the debug output.
+ *              This allows compiler macros such as __func__ to be used with no
+ *              change to the debug output.
  *
  ******************************************************************************/
 
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2236c6f31608..d2cae5fc211a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1391,7 +1391,7 @@ static int blkfront_probe(struct xenbus_device *dev,
 			if (major != XENVBD_MAJOR) {
 				printk(KERN_INFO
 						"%s: HVM does not support vbd %d as xen block device\n",
-						__FUNCTION__, vdevice);
+						__func__, vdevice);
 				return -ENODEV;
 			}
 		}
diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h
index 9318a87ee39a..a8f344363e77 100644
--- a/include/acpi/acoutput.h
+++ b/include/acpi/acoutput.h
@@ -240,7 +240,7 @@
 /*
  * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header,
  * define it now. This is the case where there the compiler does not support
- * a __FUNCTION__ macro or equivalent.
+ * a __func__ macro or equivalent.
  */
 #ifndef ACPI_GET_FUNCTION_NAME
 #define ACPI_GET_FUNCTION_NAME          _acpi_function_name
@@ -249,12 +249,12 @@
  * The Name parameter should be the procedure name as a quoted string.
  * The function name is also used by the function exit macros below.
  * Note: (const char) is used to be compatible with the debug interfaces
- * and macros such as __FUNCTION__.
+ * and macros such as __func__.
  */
 #define ACPI_FUNCTION_NAME(name)        static const char _acpi_function_name[] = #name;
 
 #else
-/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */
+/* Compiler supports __func__ (or equivalent) -- Ignore this macro */
 
 #define ACPI_FUNCTION_NAME(name)
 #endif				/* ACPI_GET_FUNCTION_NAME */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e42e7dc34c68..d6d630d31ef3 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -800,9 +800,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
 	(type *)( (char *)__mptr - offsetof(type,member) );})
 
-/* Trap pasters of __FUNCTION__ at compile-time */
-#define __FUNCTION__ (__func__)
-
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
-- 
cgit v1.2.3


From d1214c65c02d503330ce86bd38e344a36599e055 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:01:50 -0800
Subject: libstring_helpers.c:string_get_size(): return void

string_get_size() was documented to return an error, but in fact always
returned 0.  Since the output always fits in 9 bytes, just document that
and let callers do what they do now: pass a small stack buffer and ignore
the return value.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h |  4 ++--
 lib/string_helpers.c           | 10 ++++------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 6eb567ac56bc..657571817260 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -10,8 +10,8 @@ enum string_size_units {
 	STRING_UNITS_2,		/* use binary powers of 2^10 */
 };
 
-int string_get_size(u64 size, enum string_size_units units,
-		    char *buf, int len);
+void string_get_size(u64 size, enum string_size_units units,
+		     char *buf, int len);
 
 #define UNESCAPE_SPACE		0x01
 #define UNESCAPE_OCTAL		0x02
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 2b3757f84b3b..8f8c4417f228 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -20,12 +20,12 @@
  * @len:	length of buffer
  *
  * This function returns a string formatted to 3 significant figures
- * giving the size in the required units.  Returns 0 on success or
- * error on failure.  @buf is always zero terminated.
+ * giving the size in the required units.  @buf should have room for
+ * at least 9 bytes and will always be zero terminated.
  *
  */
-int string_get_size(u64 size, const enum string_size_units units,
-		    char *buf, int len)
+void string_get_size(u64 size, const enum string_size_units units,
+		     char *buf, int len)
 {
 	static const char *const units_10[] = {
 		"B", "kB", "MB", "GB", "TB", "PB", "EB"
@@ -67,8 +67,6 @@ int string_get_size(u64 size, const enum string_size_units units,
 
 	snprintf(buf, len, "%u%s %s", (u32)size,
 		 tmp, units_str[units][i]);
-
-	return 0;
 }
 EXPORT_SYMBOL(string_get_size);
 
-- 
cgit v1.2.3


From 8b4daad52fee7731ea4bae22a99ece2d4ba7ba43 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:01:53 -0800
Subject: lib/bitmap.c: more signed->unsigned conversions

For consistency with the other bitmap_* functions, also make the nbits
parameter of bitmap_zero, bitmap_fill and bitmap_copy unsigned.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 202e4034fe26..1406d5453781 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -185,33 +185,33 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
 #define small_const_nbits(nbits) \
 	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
 
-static inline void bitmap_zero(unsigned long *dst, int nbits)
+static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = 0UL;
 	else {
-		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+		unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
 		memset(dst, 0, len);
 	}
 }
 
-static inline void bitmap_fill(unsigned long *dst, int nbits)
+static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
-	size_t nlongs = BITS_TO_LONGS(nbits);
+	unsigned int nlongs = BITS_TO_LONGS(nbits);
 	if (!small_const_nbits(nbits)) {
-		int len = (nlongs - 1) * sizeof(unsigned long);
+		unsigned int len = (nlongs - 1) * sizeof(unsigned long);
 		memset(dst, 0xff,  len);
 	}
 	dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
 }
 
 static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
-			int nbits)
+			unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = *src;
 	else {
-		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+		unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
 		memcpy(dst, src, len);
 	}
 }
-- 
cgit v1.2.3


From 33c4fa8c6763f1ba9f4ea64079882eaa6d7957b7 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:01:56 -0800
Subject: linux/nodemask.h: update bitmap wrappers to take unsigned int

Since the various bitmap_* functions now take an unsigned int as nbits
parameter, it makes sense to also update the various wrappers.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nodemask.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 83a6aeda899d..21cef483dc1b 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -120,13 +120,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp)
 }
 
 #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
-static inline void __nodes_setall(nodemask_t *dstp, int nbits)
+static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
 {
 	bitmap_fill(dstp->bits, nbits);
 }
 
 #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
-static inline void __nodes_clear(nodemask_t *dstp, int nbits)
+static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
 {
 	bitmap_zero(dstp->bits, nbits);
 }
@@ -144,7 +144,7 @@ static inline int __node_test_and_set(int node, nodemask_t *addr)
 #define nodes_and(dst, src1, src2) \
 			__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
 static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
@@ -152,7 +152,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
 #define nodes_or(dst, src1, src2) \
 			__nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
 static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
@@ -160,7 +160,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
 #define nodes_xor(dst, src1, src2) \
 			__nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
 static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
@@ -168,7 +168,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
 #define nodes_andnot(dst, src1, src2) \
 			__nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
 static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
@@ -176,7 +176,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
 #define nodes_complement(dst, src) \
 			__nodes_complement(&(dst), &(src), MAX_NUMNODES)
 static inline void __nodes_complement(nodemask_t *dstp,
-					const nodemask_t *srcp, int nbits)
+					const nodemask_t *srcp, unsigned int nbits)
 {
 	bitmap_complement(dstp->bits, srcp->bits, nbits);
 }
@@ -184,7 +184,7 @@ static inline void __nodes_complement(nodemask_t *dstp,
 #define nodes_equal(src1, src2) \
 			__nodes_equal(&(src1), &(src2), MAX_NUMNODES)
 static inline int __nodes_equal(const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_equal(src1p->bits, src2p->bits, nbits);
 }
@@ -192,7 +192,7 @@ static inline int __nodes_equal(const nodemask_t *src1p,
 #define nodes_intersects(src1, src2) \
 			__nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
 static inline int __nodes_intersects(const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
 }
@@ -200,25 +200,25 @@ static inline int __nodes_intersects(const nodemask_t *src1p,
 #define nodes_subset(src1, src2) \
 			__nodes_subset(&(src1), &(src2), MAX_NUMNODES)
 static inline int __nodes_subset(const nodemask_t *src1p,
-					const nodemask_t *src2p, int nbits)
+					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_subset(src1p->bits, src2p->bits, nbits);
 }
 
 #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
-static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
+static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_empty(srcp->bits, nbits);
 }
 
 #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
-static inline int __nodes_full(const nodemask_t *srcp, int nbits)
+static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_full(srcp->bits, nbits);
 }
 
 #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
-static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
+static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_weight(srcp->bits, nbits);
 }
-- 
cgit v1.2.3


From f5ac1f55204fec9d9c63644bc1de0ab6a59af9f1 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:01:59 -0800
Subject: linux/cpumask.h: update bitmap wrappers to take unsigned int

Since the various bitmap_* functions now take an unsigned int as nbits
parameter, it makes sense to also update the various wrappers, even though
they're marked as obsolete.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpumask.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b950e9d6008b..ff9044286d88 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -905,13 +905,13 @@ static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
 }
 
 #define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
-static inline void __cpus_setall(cpumask_t *dstp, int nbits)
+static inline void __cpus_setall(cpumask_t *dstp, unsigned int nbits)
 {
 	bitmap_fill(dstp->bits, nbits);
 }
 
 #define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
-static inline void __cpus_clear(cpumask_t *dstp, int nbits)
+static inline void __cpus_clear(cpumask_t *dstp, unsigned int nbits)
 {
 	bitmap_zero(dstp->bits, nbits);
 }
@@ -927,21 +927,21 @@ static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
 
 #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
 static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
 
 #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
 static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
 
 #define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
 static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
@@ -949,40 +949,40 @@ static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
 #define cpus_andnot(dst, src1, src2) \
 				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
 static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
 
 #define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
 static inline int __cpus_equal(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	return bitmap_equal(src1p->bits, src2p->bits, nbits);
 }
 
 #define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
 static inline int __cpus_intersects(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
 }
 
 #define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
 static inline int __cpus_subset(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+					const cpumask_t *src2p, unsigned int nbits)
 {
 	return bitmap_subset(src1p->bits, src2p->bits, nbits);
 }
 
 #define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
-static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
+static inline int __cpus_empty(const cpumask_t *srcp, unsigned int nbits)
 {
 	return bitmap_empty(srcp->bits, nbits);
 }
 
 #define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
-static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
+static inline int __cpus_weight(const cpumask_t *srcp, unsigned int nbits)
 {
 	return bitmap_weight(srcp->bits, nbits);
 }
-- 
cgit v1.2.3


From eb5698837881687841c9e477e4162ac3387c6b59 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:02:01 -0800
Subject: lib/bitmap.c: update bitmap_onto to unsigned

Change the nbits parameter of bitmap_onto to unsigned int for consistency
with other bitmap_* functions.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 2 +-
 lib/bitmap.c           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1406d5453781..d0c6214eb190 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -164,7 +164,7 @@ extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
 extern int bitmap_bitremap(int oldbit,
 		const unsigned long *old, const unsigned long *new, int bits);
 extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
-		const unsigned long *relmap, int bits);
+		const unsigned long *relmap, unsigned int bits);
 extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 		int sz, int bits);
 extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 324ea9eab8c1..ee598a496895 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1006,9 +1006,9 @@ EXPORT_SYMBOL(bitmap_bitremap);
  * All bits in @dst not set by the above rule are cleared.
  */
 void bitmap_onto(unsigned long *dst, const unsigned long *orig,
-			const unsigned long *relmap, int bits)
+			const unsigned long *relmap, unsigned int bits)
 {
-	int n, m;       	/* same meaning as in above comment */
+	unsigned int n, m;	/* same meaning as in above comment */
 
 	if (dst == orig)	/* following doesn't handle inplace mappings */
 		return;
-- 
cgit v1.2.3


From b26ad5836c3a0a9d456eb60b9f841ca15403ee59 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:02:04 -0800
Subject: lib/bitmap.c: change parameters of bitmap_fold to unsigned

Change the sz and nbits parameters of bitmap_fold to unsigned int for
consistency with other bitmap_* functions, and to save another few bytes
in the generated code.

[akpm@linux-foundation.org: fix kerneldoc]
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h |  2 +-
 lib/bitmap.c           | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index d0c6214eb190..95dcd2f76e1a 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -166,7 +166,7 @@ extern int bitmap_bitremap(int oldbit,
 extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 		const unsigned long *relmap, unsigned int bits);
 extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
-		int sz, int bits);
+		unsigned int sz, unsigned int nbits);
 extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index ee598a496895..b0d3e823dab3 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1039,22 +1039,22 @@ EXPORT_SYMBOL(bitmap_onto);
  *	@dst: resulting smaller bitmap
  *	@orig: original larger bitmap
  *	@sz: specified size
- *	@bits: number of bits in each of these bitmaps
+ *	@nbits: number of bits in each of these bitmaps
  *
  * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
  * Clear all other bits in @dst.  See further the comment and
  * Example [2] for bitmap_onto() for why and how to use this.
  */
 void bitmap_fold(unsigned long *dst, const unsigned long *orig,
-			int sz, int bits)
+			unsigned int sz, unsigned int nbits)
 {
-	int oldbit;
+	unsigned int oldbit;
 
 	if (dst == orig)	/* following doesn't handle inplace mappings */
 		return;
-	bitmap_zero(dst, bits);
+	bitmap_zero(dst, nbits);
 
-	for_each_set_bit(oldbit, orig, bits)
+	for_each_set_bit(oldbit, orig, nbits)
 		set_bit(oldbit % sz, dst);
 }
 EXPORT_SYMBOL(bitmap_fold);
-- 
cgit v1.2.3


From f6a1f5db8d7a7a94ff07251996959d27daba4ee7 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:02:10 -0800
Subject: lib/bitmap.c: simplify bitmap_ord_to_pos

Make the return value and the ord and nbits parameters of
bitmap_ord_to_pos unsigned.

Also, simplify the implementation and as a side effect make the result
fully defined, returning nbits for ord >= weight, in analogy with what
find_{first,next}_bit does.  This is a better sentinel than the former
("unofficial") 0.  No current users are affected by this change.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h |  2 +-
 lib/bitmap.c           | 28 +++++++++++-----------------
 2 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 95dcd2f76e1a..1e74fe7aa167 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -171,7 +171,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int
 extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
-extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
+extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
 extern int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
 
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 84d20b5c6bf1..e8a38bde7af9 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -771,34 +771,28 @@ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigne
  * bitmap_ord_to_pos - find position of n-th set bit in bitmap
  *	@buf: pointer to bitmap
  *	@ord: ordinal bit position (n-th set bit, n >= 0)
- *	@bits: number of valid bit positions in @buf
+ *	@nbits: number of valid bit positions in @buf
  *
  * Map the ordinal offset of bit @ord in @buf to its position in @buf.
- * Value of @ord should be in range 0 <= @ord < weight(buf), else
- * results are undefined.
+ * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord
+ * >= weight(buf), returns @nbits.
  *
  * If for example, just bits 4 through 7 are set in @buf, then @ord
  * values 0 through 3 will get mapped to 4 through 7, respectively,
- * and all other @ord values return undefined values.  When @ord value 3
+ * and all other @ord values returns @nbits.  When @ord value 3
  * gets mapped to (returns) @pos value 7 in this example, that means
  * that the 3rd set bit (starting with 0th) is at position 7 in @buf.
  *
- * The bit positions 0 through @bits are valid positions in @buf.
+ * The bit positions 0 through @nbits-1 are valid positions in @buf.
  */
-int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits)
 {
-	int pos = 0;
+	unsigned int pos;
 
-	if (ord >= 0 && ord < bits) {
-		int i;
-
-		for (i = find_first_bit(buf, bits);
-		     i < bits && ord > 0;
-		     i = find_next_bit(buf, bits, i + 1))
-	     		ord--;
-		if (i < bits && ord == 0)
-			pos = i;
-	}
+	for (pos = find_first_bit(buf, nbits);
+	     pos < nbits && ord;
+	     pos = find_next_bit(buf, nbits, pos + 1))
+		ord--;
 
 	return pos;
 }
-- 
cgit v1.2.3


From 9814ec135dedb70a9daa41e68798d540d7ba71f2 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:02:13 -0800
Subject: lib/bitmap.c: make the bits parameter of bitmap_remap unsigned

Also, rename bits to nbits. Both changes for consistency with other
bitmap_* functions.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h |  2 +-
 lib/bitmap.c           | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1e74fe7aa167..5f5c00de39f0 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -160,7 +160,7 @@ extern int bitmap_parselist(const char *buf, unsigned long *maskp,
 extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
 extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
-		const unsigned long *old, const unsigned long *new, int bits);
+		const unsigned long *old, const unsigned long *new, unsigned int nbits);
 extern int bitmap_bitremap(int oldbit,
 		const unsigned long *old, const unsigned long *new, int bits);
 extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index e8a38bde7af9..ad161a6c82db 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -803,7 +803,7 @@ unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsig
  *	@src: subset to be remapped
  *	@old: defines domain of map
  *	@new: defines range of map
- *	@bits: number of bits in each of these bitmaps
+ *	@nbits: number of bits in each of these bitmaps
  *
  * Let @old and @new define a mapping of bit positions, such that
  * whatever position is held by the n-th set bit in @old is mapped
@@ -831,22 +831,22 @@ unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsig
  */
 void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new,
-		int bits)
+		unsigned int nbits)
 {
-	int oldbit, w;
+	unsigned int oldbit, w;
 
 	if (dst == src)		/* following doesn't handle inplace remaps */
 		return;
-	bitmap_zero(dst, bits);
+	bitmap_zero(dst, nbits);
 
-	w = bitmap_weight(new, bits);
-	for_each_set_bit(oldbit, src, bits) {
-	     	int n = bitmap_pos_to_ord(old, oldbit, bits);
+	w = bitmap_weight(new, nbits);
+	for_each_set_bit(oldbit, src, nbits) {
+		int n = bitmap_pos_to_ord(old, oldbit, nbits);
 
 		if (n < 0 || w == 0)
 			set_bit(oldbit, dst);	/* identity map */
 		else
-			set_bit(bitmap_ord_to_pos(new, n % w, bits), dst);
+			set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst);
 	}
 }
 EXPORT_SYMBOL(bitmap_remap);
-- 
cgit v1.2.3


From af3cd13501eb04ca61d017ff4406f1cbffafdc04 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:02:15 -0800
Subject: lib/string.c: remove strnicmp()

Now that all in-tree users of strnicmp have been converted to
strncasecmp, the wrapper can be removed.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: David Howells <dhowells@redhat.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/include/asm/string.h  | 1 -
 arch/s390/include/asm/string.h | 1 -
 include/linux/string.h         | 3 ---
 lib/string.c                   | 8 --------
 4 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/frv/include/asm/string.h b/arch/frv/include/asm/string.h
index 5ed310f64b7e..1f6c35990439 100644
--- a/arch/frv/include/asm/string.h
+++ b/arch/frv/include/asm/string.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *, const void *, __kernel_size_t);
 #define __HAVE_ARCH_STRNCAT 1
 #define __HAVE_ARCH_STRCMP 1
 #define __HAVE_ARCH_STRNCMP 1
-#define __HAVE_ARCH_STRNICMP 1
 #define __HAVE_ARCH_STRCHR 1
 #define __HAVE_ARCH_STRRCHR 1
 #define __HAVE_ARCH_STRSTR 1
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 7e2dcd7c57ef..8662f5c8e17f 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -44,7 +44,6 @@ extern char *strstr(const char *, const char *);
 #undef __HAVE_ARCH_STRCHR
 #undef __HAVE_ARCH_STRNCHR
 #undef __HAVE_ARCH_STRNCMP
-#undef __HAVE_ARCH_STRNICMP
 #undef __HAVE_ARCH_STRPBRK
 #undef __HAVE_ARCH_STRSEP
 #undef __HAVE_ARCH_STRSPN
diff --git a/include/linux/string.h b/include/linux/string.h
index 2e22a2e58f3a..b9bc9a5d9e21 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -40,9 +40,6 @@ extern int strcmp(const char *,const char *);
 #ifndef __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *,const char *,__kernel_size_t);
 #endif
-#ifndef __HAVE_ARCH_STRNICMP
-#define strnicmp strncasecmp
-#endif
 #ifndef __HAVE_ARCH_STRCASECMP
 extern int strcasecmp(const char *s1, const char *s2);
 #endif
diff --git a/lib/string.c b/lib/string.c
index 10063300b830..3206d0178296 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -58,14 +58,6 @@ int strncasecmp(const char *s1, const char *s2, size_t len)
 }
 EXPORT_SYMBOL(strncasecmp);
 #endif
-#ifndef __HAVE_ARCH_STRNICMP
-#undef strnicmp
-int strnicmp(const char *s1, const char *s2, size_t len)
-{
-	return strncasecmp(s1, s2, len);
-}
-EXPORT_SYMBOL(strnicmp);
-#endif
 
 #ifndef __HAVE_ARCH_STRCASECMP
 int strcasecmp(const char *s1, const char *s2)
-- 
cgit v1.2.3


From 114fc1afb2de7dec40da137dc2a55cd38fc220f2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 12 Feb 2015 15:02:29 -0800
Subject: hexdump: make it return number of bytes placed in buffer

This patch makes hexdump return the number of bytes placed in the buffer
excluding trailing NUL.  In the case of overflow it returns the desired
amount of bytes to produce the entire dump.  Thus, it mimics snprintf().

This will be useful for users that would like to repeat with a bigger
buffer.

[akpm@linux-foundation.org: fix printk warning]
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h |  6 ++---
 lib/hexdump.c          | 73 +++++++++++++++++++++++++++++++++++++-------------
 lib/test-hexdump.c     | 45 +++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 4d5bf5726578..baa3f97d8ce8 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -417,9 +417,9 @@ enum {
 	DUMP_PREFIX_ADDRESS,
 	DUMP_PREFIX_OFFSET
 };
-extern void hex_dump_to_buffer(const void *buf, size_t len,
-			       int rowsize, int groupsize,
-			       char *linebuf, size_t linebuflen, bool ascii);
+extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
+			      int groupsize, char *linebuf, size_t linebuflen,
+			      bool ascii);
 #ifdef CONFIG_PRINTK
 extern void print_hex_dump(const char *level, const char *prefix_str,
 			   int prefix_type, int rowsize, int groupsize,
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 4af53f73c7cc..7ea09699855d 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -97,22 +97,26 @@ EXPORT_SYMBOL(bin2hex);
  *
  * example output buffer:
  * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
+ *
+ * Return:
+ * The amount of bytes placed in the buffer without terminating NUL. If the
+ * output was truncated, then the return value is the number of bytes
+ * (excluding the terminating NUL) which would have been written to the final
+ * string if enough space had been available.
  */
-void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
-			int groupsize, char *linebuf, size_t linebuflen,
-			bool ascii)
+int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize,
+		       char *linebuf, size_t linebuflen, bool ascii)
 {
 	const u8 *ptr = buf;
 	int ngroups;
 	u8 ch;
 	int j, lx = 0;
 	int ascii_column;
+	int ret;
 
 	if (rowsize != 16 && rowsize != 32)
 		rowsize = 16;
 
-	if (!len)
-		goto nil;
 	if (len > rowsize)		/* limit to one line at a time */
 		len = rowsize;
 	if (!is_power_of_2(groupsize) || groupsize > 8)
@@ -122,27 +126,50 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
 
 	ngroups = len / groupsize;
 	ascii_column = rowsize * 2 + rowsize / groupsize + 1;
+
+	if (!linebuflen)
+		goto overflow1;
+
+	if (!len)
+		goto nil;
+
 	if (groupsize == 8) {
 		const u64 *ptr8 = buf;
 
-		for (j = 0; j < ngroups; j++)
-			lx += scnprintf(linebuf + lx, linebuflen - lx,
-					"%s%16.16llx", j ? " " : "",
-					(unsigned long long)*(ptr8 + j));
+		for (j = 0; j < ngroups; j++) {
+			ret = snprintf(linebuf + lx, linebuflen - lx,
+				       "%s%16.16llx", j ? " " : "",
+				       (unsigned long long)*(ptr8 + j));
+			if (ret >= linebuflen - lx)
+				goto overflow1;
+			lx += ret;
+		}
 	} else if (groupsize == 4) {
 		const u32 *ptr4 = buf;
 
-		for (j = 0; j < ngroups; j++)
-			lx += scnprintf(linebuf + lx, linebuflen - lx,
-					"%s%8.8x", j ? " " : "", *(ptr4 + j));
+		for (j = 0; j < ngroups; j++) {
+			ret = snprintf(linebuf + lx, linebuflen - lx,
+				       "%s%8.8x", j ? " " : "",
+				       *(ptr4 + j));
+			if (ret >= linebuflen - lx)
+				goto overflow1;
+			lx += ret;
+		}
 	} else if (groupsize == 2) {
 		const u16 *ptr2 = buf;
 
-		for (j = 0; j < ngroups; j++)
-			lx += scnprintf(linebuf + lx, linebuflen - lx,
-					"%s%4.4x", j ? " " : "", *(ptr2 + j));
+		for (j = 0; j < ngroups; j++) {
+			ret = snprintf(linebuf + lx, linebuflen - lx,
+				       "%s%4.4x", j ? " " : "",
+				       *(ptr2 + j));
+			if (ret >= linebuflen - lx)
+				goto overflow1;
+			lx += ret;
+		}
 	} else {
-		for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) {
+		for (j = 0; j < len; j++) {
+			if (linebuflen < lx + 3)
+				goto overflow2;
 			ch = ptr[j];
 			linebuf[lx++] = hex_asc_hi(ch);
 			linebuf[lx++] = hex_asc_lo(ch);
@@ -154,14 +181,24 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
 	if (!ascii)
 		goto nil;
 
-	while (lx < (linebuflen - 1) && lx < ascii_column)
+	while (lx < ascii_column) {
+		if (linebuflen < lx + 2)
+			goto overflow2;
 		linebuf[lx++] = ' ';
-	for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) {
+	}
+	for (j = 0; j < len; j++) {
+		if (linebuflen < lx + 2)
+			goto overflow2;
 		ch = ptr[j];
 		linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.';
 	}
 nil:
+	linebuf[lx] = '\0';
+	return lx;
+overflow2:
 	linebuf[lx++] = '\0';
+overflow1:
+	return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1;
 }
 EXPORT_SYMBOL(hex_dump_to_buffer);
 
diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c
index 9d3bd1e9ae48..daf29a390a89 100644
--- a/lib/test-hexdump.c
+++ b/lib/test-hexdump.c
@@ -114,6 +114,45 @@ static void __init test_hexdump_set(int rowsize, bool ascii)
 	test_hexdump(len, rowsize, 1, ascii);
 }
 
+static void __init test_hexdump_overflow(bool ascii)
+{
+	char buf[56];
+	const char *t = test_data_1_le[0];
+	size_t l = get_random_int() % sizeof(buf);
+	bool a;
+	int e, r;
+
+	memset(buf, ' ', sizeof(buf));
+
+	r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii);
+
+	if (ascii)
+		e = 50;
+	else
+		e = 2;
+	buf[e + 2] = '\0';
+
+	if (!l) {
+		a = r == e && buf[0] == ' ';
+	} else if (l < 3) {
+		a = r == e && buf[0] == '\0';
+	} else if (l < 4) {
+		a = r == e && !strcmp(buf, t);
+	} else if (ascii) {
+		if (l < 51)
+			a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' ';
+		else
+			a = r == e && buf[50] == '\0' && buf[49] == '.';
+	} else {
+		a = r == e && buf[e] == '\0';
+	}
+
+	if (!a) {
+		pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf));
+		pr_err("Result: '%s'\n", buf);
+	}
+}
+
 static int __init test_hexdump_init(void)
 {
 	unsigned int i;
@@ -129,6 +168,12 @@ static int __init test_hexdump_init(void)
 	for (i = 0; i < 16; i++)
 		test_hexdump_set(rowsize, true);
 
+	for (i = 0; i < 16; i++)
+		test_hexdump_overflow(false);
+
+	for (i = 0; i < 16; i++)
+		test_hexdump_overflow(true);
+
 	return -EINVAL;
 }
 module_init(test_hexdump_init);
-- 
cgit v1.2.3


From 3248340d3f10871d2ae2d1df9e323f284fa1fdb6 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 12 Feb 2015 15:02:40 -0800
Subject: lib/halfmd4.c: simplify includes

We only need EXPORT_SYMBOL, so compiler.h and export.h suffice.  This
means linux/types.h is no longer implicitly included, so add an include of
uapi/linux/types.h to linux/cryptohash.h for __u32.  Other users of
cryptohash.h cannot be affected, since they must already have been
including uapi/linux/types.h in order for gcc not to complain about
unknown types.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cryptohash.h | 2 ++
 lib/halfmd4.c              | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index 2cd9f1cf9fa3..f4754282c9c2 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -1,6 +1,8 @@
 #ifndef __CRYPTOHASH_H
 #define __CRYPTOHASH_H
 
+#include <uapi/linux/types.h>
+
 #define SHA_DIGEST_WORDS 5
 #define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
 #define SHA_WORKSPACE_WORDS 16
diff --git a/lib/halfmd4.c b/lib/halfmd4.c
index 66d0ee8b7776..a8fe6274a13c 100644
--- a/lib/halfmd4.c
+++ b/lib/halfmd4.c
@@ -1,4 +1,4 @@
-#include <linux/kernel.h>
+#include <linux/compiler.h>
 #include <linux/export.h>
 #include <linux/cryptohash.h>
 
-- 
cgit v1.2.3


From 3810631332465d967ba5e27ea2c7dff2c9afac6c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 12 Feb 2015 23:33:15 +0100
Subject: PM / sleep: Re-implement suspend-to-idle handling

In preparation for adding support for quiescing timers in the final
stage of suspend-to-idle transitions, rework the freeze_enter()
function making the system wait on a wakeup event, the freeze_wake()
function terminating the suspend-to-idle loop and the mechanism by
which deep idle states are entered during suspend-to-idle.

First of all, introduce a simple state machine for suspend-to-idle
and make the code in question use it.

Second, prevent freeze_enter() from losing wakeup events due to race
conditions and ensure that the number of online CPUs won't change
while it is being executed.  In addition to that, make it force
all of the CPUs re-enter the idle loop in case they are in idle
states already (so they can enter deeper idle states if possible).

Next, drop cpuidle_use_deepest_state() and replace use_deepest_state
checks in cpuidle_select() and cpuidle_reflect() with a single
suspend-to-idle state check in cpuidle_idle_call().

Finally, introduce cpuidle_enter_freeze() that will simply find the
deepest idle state available to the given CPU and enter it using
cpuidle_enter().

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpuidle/cpuidle.c | 49 +++++++++++++++++++++++++----------------------
 include/linux/cpuidle.h   |  4 ++--
 include/linux/suspend.h   | 16 ++++++++++++++++
 kernel/power/suspend.c    | 43 ++++++++++++++++++++++++++++++++++-------
 kernel/sched/idle.c       | 16 ++++++++++++++++
 5 files changed, 96 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 125150dc6e81..23a8d6cc8d30 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -19,6 +19,7 @@
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -32,7 +33,6 @@ LIST_HEAD(cpuidle_detected_devices);
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
-static bool use_deepest_state __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -66,24 +66,9 @@ int cpuidle_play_dead(void)
 }
 
 /**
- * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
- * @enable: Whether enable or disable the feature.
- *
- * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
- * always use the state with the greatest exit latency (out of the states that
- * are not disabled).
- *
- * This function can only be called after cpuidle_pause() to avoid races.
- */
-void cpuidle_use_deepest_state(bool enable)
-{
-	use_deepest_state = enable;
-}
-
-/**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
- * @drv: cpuidle driver for a given CPU.
- * @dev: cpuidle device for a given CPU.
+ * cpuidle_find_deepest_state - Find deepest state meeting specific conditions.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
  */
 static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 				      struct cpuidle_device *dev)
@@ -104,6 +89,27 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 	return ret;
 }
 
+/**
+ * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle.
+ *
+ * Find the deepest state available and enter it.
+ */
+void cpuidle_enter_freeze(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int index;
+
+	index = cpuidle_find_deepest_state(drv, dev);
+	if (index >= 0)
+		cpuidle_enter(drv, dev, index);
+	else
+		arch_cpu_idle();
+
+	/* Interrupts are enabled again here. */
+	local_irq_disable();
+}
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -166,9 +172,6 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	if (!drv || !dev || !dev->enabled)
 		return -EBUSY;
 
-	if (unlikely(use_deepest_state))
-		return cpuidle_find_deepest_state(drv, dev);
-
 	return cpuidle_curr_governor->select(drv, dev);
 }
 
@@ -200,7 +203,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  */
 void cpuidle_reflect(struct cpuidle_device *dev, int index)
 {
-	if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
+	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev, index);
 }
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index ab70f3bc44ad..f63aabf4ee90 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -141,7 +141,7 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
-extern void cpuidle_use_deepest_state(bool enable);
+extern void cpuidle_enter_freeze(void);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
@@ -174,7 +174,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
-static inline void cpuidle_use_deepest_state(bool enable) {}
+static inline void cpuidle_enter_freeze(void) { }
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 	struct cpuidle_device *dev) {return NULL; }
 #endif
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 3388c1b6f7d8..5efe743ce1e8 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -201,6 +201,21 @@ struct platform_freeze_ops {
  */
 extern void suspend_set_ops(const struct platform_suspend_ops *ops);
 extern int suspend_valid_only_mem(suspend_state_t state);
+
+/* Suspend-to-idle state machnine. */
+enum freeze_state {
+	FREEZE_STATE_NONE,      /* Not suspended/suspending. */
+	FREEZE_STATE_ENTER,     /* Enter suspend-to-idle. */
+	FREEZE_STATE_WAKE,      /* Wake up from suspend-to-idle. */
+};
+
+extern enum freeze_state __read_mostly suspend_freeze_state;
+
+static inline bool idle_should_freeze(void)
+{
+	return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER);
+}
+
 extern void freeze_set_ops(const struct platform_freeze_ops *ops);
 extern void freeze_wake(void);
 
@@ -228,6 +243,7 @@ extern int pm_suspend(suspend_state_t state);
 
 static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
+static inline bool idle_should_freeze(void) { return false; }
 static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
 static inline void freeze_wake(void) {}
 #endif /* !CONFIG_SUSPEND */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 static void freeze_enter(void)
 {
-	cpuidle_use_deepest_state(true);
+	spin_lock_irq(&suspend_freeze_lock);
+	if (pm_wakeup_pending())
+		goto out;
+
+	suspend_freeze_state = FREEZE_STATE_ENTER;
+	spin_unlock_irq(&suspend_freeze_lock);
+
+	get_online_cpus();
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+	/* Push all the CPUs into the idle loop. */
+	wake_up_all_idle_cpus();
+	pr_debug("PM: suspend-to-idle\n");
+	/* Make the current CPU wait so it can enter the idle loop too. */
+	wait_event(suspend_freeze_wait_head,
+		   suspend_freeze_state == FREEZE_STATE_WAKE);
+	pr_debug("PM: resume from suspend-to-idle\n");
+
 	cpuidle_pause();
-	cpuidle_use_deepest_state(false);
+	put_online_cpus();
+
+	spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+	suspend_freeze_state = FREEZE_STATE_NONE;
+	spin_unlock_irq(&suspend_freeze_lock);
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	unsigned long flags;
+
+	spin_lock_irqsave(&suspend_freeze_lock, flags);
+	if (suspend_freeze_state > FREEZE_STATE_NONE) {
+		suspend_freeze_state = FREEZE_STATE_WAKE;
+		wake_up(&suspend_freeze_wait_head);
+	}
+	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index aaf1c1d5cf5d..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/suspend.h>
 
 #include <asm/tlb.h>
 
@@ -104,6 +105,21 @@ static void cpuidle_idle_call(void)
 	 */
 	rcu_idle_enter();
 
+	/*
+	 * Suspend-to-idle ("freeze") is a system state in which all user space
+	 * has been frozen, all I/O devices have been suspended and the only
+	 * activity happens here and in iterrupts (if any).  In that case bypass
+	 * the cpuidle governor and go stratight for the deepest idle state
+	 * available.  Possibly also suspend the local tick and the entire
+	 * timekeeping to prevent timer interrupts from kicking us out of idle
+	 * until a proper wakeup interrupt happens.
+	 */
+	if (idle_should_freeze()) {
+		cpuidle_enter_freeze();
+		local_irq_enable();
+		goto exit_idle;
+	}
+
 	/*
 	 * Ask the cpuidle framework to choose a convenient idle state.
 	 * Fall back to the default arch idle method on errors.
-- 
cgit v1.2.3


From 9b6c2d2e2ba5280649eb043cbc7e3483c77e5d69 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 13 Feb 2015 14:35:57 -0800
Subject: lib/bitmap.c: change prototype of bitmap_copy_le

Make the prototype of bitmap_copy_le the same as bitmap_copy's.  All other
bitmap_* functions take unsigned long* parameters; there's no reason this
should be special.

The only current user is the static inline uwb_mas_bm_copy_le, which
already does the void* laundering, so the end users can pass their u8 or
__le32 buffers without a cast.

Furthermore, this allows us to simply let bitmap_copy_le be an alias for
bitmap_copy on little-endian; see next patch.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 2 +-
 lib/bitmap.c           | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 5f5c00de39f0..334fe32d8f0e 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -170,7 +170,7 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
-extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
+extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
 extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
 extern int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index ad161a6c82db..e4ac20bec76c 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1191,16 +1191,15 @@ EXPORT_SYMBOL(bitmap_allocate_region);
  *
  * Require nbits % BITS_PER_LONG == 0.
  */
-void bitmap_copy_le(void *dst, const unsigned long *src, int nbits)
+void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits)
 {
-	unsigned long *d = dst;
-	int i;
+	unsigned int i;
 
 	for (i = 0; i < nbits/BITS_PER_LONG; i++) {
 		if (BITS_PER_LONG == 64)
-			d[i] = cpu_to_le64(src[i]);
+			dst[i] = cpu_to_le64(src[i]);
 		else
-			d[i] = cpu_to_le32(src[i]);
+			dst[i] = cpu_to_le32(src[i]);
 	}
 }
 EXPORT_SYMBOL(bitmap_copy_le);
-- 
cgit v1.2.3


From e8f24278329dc31b3b8223c83a5465c9df153d9d Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 13 Feb 2015 14:36:00 -0800
Subject: lib/bitmap.c: elide bitmap_copy_le on little-endian

On little-endian, there's no reason to have an extra, presumably less
efficient, way of copying a bitmap.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 4 ++++
 lib/bitmap.c           | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 334fe32d8f0e..cffc89c23c02 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -170,7 +170,11 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
+#ifdef __BIG_ENDIAN
 extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
+#else
+#define bitmap_copy_le bitmap_copy
+#endif
 extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
 extern int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index e4ac20bec76c..d2cd50cd4f5d 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1191,6 +1191,7 @@ EXPORT_SYMBOL(bitmap_allocate_region);
  *
  * Require nbits % BITS_PER_LONG == 0.
  */
+#ifdef __BIG_ENDIAN
 void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits)
 {
 	unsigned int i;
@@ -1203,3 +1204,4 @@ void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int n
 	}
 }
 EXPORT_SYMBOL(bitmap_copy_le);
+#endif
-- 
cgit v1.2.3


From 2fbad29917c9852fa018d572cd3d43a13465d0f8 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 13 Feb 2015 14:36:02 -0800
Subject: lib: bitmap: change bitmap_shift_right to take unsigned parameters

I've previously changed the nbits parameter of most bitmap_* functions to
unsigned; now it is bitmap_shift_{left,right}'s turn.  This alone saves
some .text, but while at it I found that there were a few other things one
could do.  The end result of these seven patches is

  $ scripts/bloat-o-meter /tmp/bitmap.o.{old,new}
  add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-328 (-328)
  function                                     old     new   delta
  __bitmap_shift_right                         384     226    -158
  __bitmap_shift_left                          306     136    -170

and less importantly also a smaller stack footprint

  $ stack-o-meter.pl master bitmap
  file                 function                       old  new  delta
  lib/bitmap.o         __bitmap_shift_right             24    8  -16
  lib/bitmap.o         __bitmap_shift_left              24    0  -24

For each pair of 0 <= shift <= nbits <= 256 I've tested the end result
with a few randomly filled src buffers (including garbage beyond nbits),
in each case verifying that the shift {left,right}-most bits of dst are
zero and the remaining nbits-shift bits correspond to src, so I'm fairly
confident I didn't screw up.  That hasn't stopped me from being wrong
before, though.

This patch (of 7):

gcc can generate slightly better code for stuff like "nbits %
BITS_PER_LONG" when it knows nbits is not negative.  Since negative size
bitmaps or shift amounts don't make sense, change these parameters of
bitmap_shift_right to unsigned.

The expressions involving "lim - 1" are still ok, since if lim is 0 the
loop is never executed.

Also use "shift" and "nbits" consistently for the parameter names.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 12 ++++++------
 lib/bitmap.c           | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index cffc89c23c02..c168a807ab9a 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -96,8 +96,8 @@ extern int __bitmap_equal(const unsigned long *bitmap1,
 			  const unsigned long *bitmap2, unsigned int nbits);
 extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
 			unsigned int nbits);
-extern void __bitmap_shift_right(unsigned long *dst,
-                        const unsigned long *src, int shift, int bits);
+extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
+				unsigned int shift, unsigned int nbits);
 extern void __bitmap_shift_left(unsigned long *dst,
                         const unsigned long *src, int shift, int bits);
 extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
@@ -313,13 +313,13 @@ static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
 	return __bitmap_weight(src, nbits);
 }
 
-static inline void bitmap_shift_right(unsigned long *dst,
-			const unsigned long *src, int n, int nbits)
+static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
+				unsigned int shift, int nbits)
 {
 	if (small_const_nbits(nbits))
-		*dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n;
+		*dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
 	else
-		__bitmap_shift_right(dst, src, n, nbits);
+		__bitmap_shift_right(dst, src, shift, nbits);
 }
 
 static inline void bitmap_shift_left(unsigned long *dst,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index d2cd50cd4f5d..45e7d14ebdfd 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -104,17 +104,17 @@ EXPORT_SYMBOL(__bitmap_complement);
  *   @dst : destination bitmap
  *   @src : source bitmap
  *   @shift : shift by this many bits
- *   @bits : bitmap size, in bits
+ *   @nbits : bitmap size, in bits
  *
  * Shifting right (dividing) means moving bits in the MS -> LS bit
  * direction.  Zeros are fed into the vacated MS positions and the
  * LS bits shifted off the bottom are lost.
  */
-void __bitmap_shift_right(unsigned long *dst,
-			const unsigned long *src, int shift, int bits)
+void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
+			unsigned shift, unsigned nbits)
 {
-	int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG;
-	int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
+	unsigned k, lim = BITS_TO_LONGS(nbits), left = nbits % BITS_PER_LONG;
+	unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
 	unsigned long mask = (1UL << left) - 1;
 	for (k = 0; off + k < lim; ++k) {
 		unsigned long upper, lower;
-- 
cgit v1.2.3


From dba94c2553da1928303c2a6c6410247c88cafc1d Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 13 Feb 2015 14:36:13 -0800
Subject: lib: bitmap: change bitmap_shift_left to take unsigned parameters

gcc can generate slightly better code for stuff like "nbits %
BITS_PER_LONG" when it knows nbits is not negative.  Since negative size
bitmaps or shift amounts don't make sense, change these parameters of
bitmap_shift_right to unsigned.

If off >= lim (which requires shift >= nbits), k is initialized with a
large positive value, but since I've let k continue to be signed, the loop
will never run and dst will be zeroed as expected.  Inside the loop, k is
guaranteed to be non-negative, so the fact that it is promoted to unsigned
in the various expressions it appears in is harmless.

Also use "shift" and "nbits" consistently for the parameter names.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 12 ++++++------
 lib/bitmap.c           | 11 ++++++-----
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index c168a807ab9a..5e7f75a6d7d0 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -98,8 +98,8 @@ extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
 			unsigned int nbits);
 extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
 				unsigned int shift, unsigned int nbits);
-extern void __bitmap_shift_left(unsigned long *dst,
-                        const unsigned long *src, int shift, int bits);
+extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
+				unsigned int shift, unsigned int nbits);
 extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 			const unsigned long *bitmap2, unsigned int nbits);
 extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
@@ -322,13 +322,13 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s
 		__bitmap_shift_right(dst, src, shift, nbits);
 }
 
-static inline void bitmap_shift_left(unsigned long *dst,
-			const unsigned long *src, int n, int nbits)
+static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
+				unsigned int shift, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
-		*dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits);
+		*dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits);
 	else
-		__bitmap_shift_left(dst, src, n, nbits);
+		__bitmap_shift_left(dst, src, shift, nbits);
 }
 
 static inline int bitmap_parse(const char *buf, unsigned int buflen,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index db88512c3451..74bdf3601245 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -148,18 +148,19 @@ EXPORT_SYMBOL(__bitmap_shift_right);
  *   @dst : destination bitmap
  *   @src : source bitmap
  *   @shift : shift by this many bits
- *   @bits : bitmap size, in bits
+ *   @nbits : bitmap size, in bits
  *
  * Shifting left (multiplying) means moving bits in the LS -> MS
  * direction.  Zeros are fed into the vacated LS bit positions
  * and those MS bits shifted off the top are lost.
  */
 
-void __bitmap_shift_left(unsigned long *dst,
-			const unsigned long *src, int shift, int bits)
+void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
+			unsigned int shift, unsigned int nbits)
 {
-	int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG;
-	int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
+	int k;
+	unsigned int lim = BITS_TO_LONGS(nbits), left = nbits % BITS_PER_LONG;
+	unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
 	for (k = lim - off - 1; k >= 0; --k) {
 		unsigned long upper, lower;
 
-- 
cgit v1.2.3


From a4bb1e43e22d3cade8f942fc6f95920248eb2fd0 Mon Sep 17 00:00:00 2001
From: Andrzej Hajda <a.hajda@samsung.com>
Date: Fri, 13 Feb 2015 14:36:24 -0800
Subject: mm/util: add kstrdup_const

kstrdup() is often used to duplicate strings where neither source neither
destination will be ever modified.  In such case we can just reuse the
source instead of duplicating it.  The problem is that we must be sure
that the source is non-modifiable and its life-time is long enough.

I suspect the good candidates for such strings are strings located in
kernel .rodata section, they cannot be modifed because the section is
read-only and their life-time is equal to kernel life-time.

This small patchset proposes alternative version of kstrdup -
kstrdup_const, which returns source string if it is located in .rodata
otherwise it fallbacks to kstrdup.  To verify if the source is in
.rodata function checks if the address is between sentinels
__start_rodata, __end_rodata.  I guess it should work with all
architectures.

The main patch is accompanied by four patches constifying kstrdup for
cases where situtation described above happens frequently.

I have tested the patchset on mobile platform (exynos4210-trats) and it
saves 3272 string allocations.  Since minimal allocation is 32 or 64
bytes depending on Kconfig options the patchset saves respectively about
100KB or 200KB of memory.

Stats from tested platform show that the main offender is sysfs:

By caller:
  2260 __kernfs_new_node
    631 clk_register+0xc8/0x1b8
    318 clk_register+0x34/0x1b8
      51 kmem_cache_create
      12 alloc_vfsmnt

By string (with count >= 5):
    883 power
    876 subsystem
    135 parameters
    132 device
     61 iommu_group
    ...

This patch (of 5):

Add an alternative version of kstrdup which returns pointer to constant
char array.  The function checks if input string is in persistent and
read-only memory section, if yes it returns the input string, otherwise it
fallbacks to kstrdup.

kstrdup_const is accompanied by kfree_const performing conditional memory
deallocation of the string.

Signed-off-by: Andrzej Hajda <a.hajda@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Mike Turquette <mturquette@linaro.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Greg KH <greg@kroah.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string.h |  3 +++
 mm/util.c              | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index b9bc9a5d9e21..e40099e585c9 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -112,7 +112,10 @@ extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 void *memchr_inv(const void *s, int c, size_t n);
 
+extern void kfree_const(const void *x);
+
 extern char *kstrdup(const char *s, gfp_t gfp);
+extern const char *kstrdup_const(const char *s, gfp_t gfp);
 extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
 
diff --git a/mm/util.c b/mm/util.c
index f3ef639c4857..3981ae9d1b15 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,10 +12,30 @@
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
 
+#include <asm/sections.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
 
+static inline int is_kernel_rodata(unsigned long addr)
+{
+	return addr >= (unsigned long)__start_rodata &&
+		addr < (unsigned long)__end_rodata;
+}
+
+/**
+ * kfree_const - conditionally free memory
+ * @x: pointer to the memory
+ *
+ * Function calls kfree only if @x is not in .rodata section.
+ */
+void kfree_const(const void *x)
+{
+	if (!is_kernel_rodata((unsigned long)x))
+		kfree(x);
+}
+EXPORT_SYMBOL(kfree_const);
+
 /**
  * kstrdup - allocate space for and copy an existing string
  * @s: the string to duplicate
@@ -37,6 +57,24 @@ char *kstrdup(const char *s, gfp_t gfp)
 }
 EXPORT_SYMBOL(kstrdup);
 
+/**
+ * kstrdup_const - conditionally duplicate an existing const string
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Function returns source string if it is in .rodata section otherwise it
+ * fallbacks to kstrdup.
+ * Strings allocated by kstrdup_const should be freed by kfree_const.
+ */
+const char *kstrdup_const(const char *s, gfp_t gfp)
+{
+	if (is_kernel_rodata((unsigned long)s))
+		return s;
+
+	return kstrdup(s, gfp);
+}
+EXPORT_SYMBOL(kstrdup_const);
+
 /**
  * kstrndup - allocate space for and copy an existing string
  * @s: the string to duplicate
-- 
cgit v1.2.3


From dfeb0750b630b72b5d4fb2461bc7179eceb54666 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Feb 2015 14:36:31 -0800
Subject: kernfs: remove KERNFS_STATIC_NAME

When a new kernfs node is created, KERNFS_STATIC_NAME is used to avoid
making a separate copy of its name.  It's currently only used for sysfs
attributes whose filenames are required to stay accessible and unchanged.
There are rare exceptions where these names are allocated and formatted
dynamically but for the vast majority of cases they're consts in the
rodata section.

Now that kernfs is converted to use kstrdup_const() and kfree_const(),
there's little point in keeping KERNFS_STATIC_NAME around.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrzej Hajda <a.hajda@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/kernfs/dir.c        | 20 ++++++++------------
 fs/kernfs/file.c       |  4 ----
 fs/sysfs/file.c        |  2 +-
 include/linux/kernfs.h |  7 ++-----
 kernel/cgroup.c        |  2 +-
 5 files changed, 12 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 35e40879860a..6acc9648f986 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -411,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn)
 
 	if (kernfs_type(kn) == KERNFS_LINK)
 		kernfs_put(kn->symlink.target_kn);
-	if (!(kn->flags & KERNFS_STATIC_NAME))
-		kfree_const(kn->name);
+
+	kfree_const(kn->name);
+
 	if (kn->iattr) {
 		if (kn->iattr->ia_secdata)
 			security_release_secctx(kn->iattr->ia_secdata,
@@ -506,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 					     const char *name, umode_t mode,
 					     unsigned flags)
 {
-	const char *dup_name = NULL;
 	struct kernfs_node *kn;
 	int ret;
 
-	if (!(flags & KERNFS_STATIC_NAME)) {
-		name = dup_name = kstrdup_const(name, GFP_KERNEL);
-		if (!name)
-			return NULL;
-	}
+	name = kstrdup_const(name, GFP_KERNEL);
+	if (!name)
+		return NULL;
 
 	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
 	if (!kn)
@@ -538,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
  err_out2:
 	kmem_cache_free(kernfs_node_cache, kn);
  err_out1:
-	kfree_const(dup_name);
+	kfree_const(name);
 	return NULL;
 }
 
@@ -1285,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 
 	kn->ns = new_ns;
 	if (new_name) {
-		if (!(kn->flags & KERNFS_STATIC_NAME))
-			old_name = kn->name;
-		kn->flags &= ~KERNFS_STATIC_NAME;
+		old_name = kn->name;
 		kn->name = new_name;
 	}
 
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddc9f9612f16..b684e8a132e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = {
  * @ops: kernfs operations for the file
  * @priv: private data for the file
  * @ns: optional namespace tag of the file
- * @name_is_static: don't copy file name
  * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
  *
  * Returns the created node on success, ERR_PTR() value on error.
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 					 umode_t mode, loff_t size,
 					 const struct kernfs_ops *ops,
 					 void *priv, const void *ns,
-					 bool name_is_static,
 					 struct lock_class_key *key)
 {
 	struct kernfs_node *kn;
@@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 	int rc;
 
 	flags = KERNFS_FILE;
-	if (name_is_static)
-		flags |= KERNFS_STATIC_NAME;
 
 	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
 	if (!kn)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dfe928a9540f..7c2867b44141 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 		key = attr->key ?: (struct lock_class_key *)&attr->skey;
 #endif
 	kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
-				  (void *)attr, ns, true, key);
+				  (void *)attr, ns, key);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
 			sysfs_warn_dup(parent, attr->name);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index d4e01b358341..71ecdab1671b 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -43,7 +43,6 @@ enum kernfs_node_flag {
 	KERNFS_HAS_SEQ_SHOW	= 0x0040,
 	KERNFS_HAS_MMAP		= 0x0080,
 	KERNFS_LOCKDEP		= 0x0100,
-	KERNFS_STATIC_NAME	= 0x0200,
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
 };
@@ -291,7 +290,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 					 umode_t mode, loff_t size,
 					 const struct kernfs_ops *ops,
 					 void *priv, const void *ns,
-					 bool name_is_static,
 					 struct lock_class_key *key);
 struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
 				       const char *name,
@@ -369,8 +367,7 @@ kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
 static inline struct kernfs_node *
 __kernfs_create_file(struct kernfs_node *parent, const char *name,
 		     umode_t mode, loff_t size, const struct kernfs_ops *ops,
-		     void *priv, const void *ns, bool name_is_static,
-		     struct lock_class_key *key)
+		     void *priv, const void *ns, struct lock_class_key *key)
 { return ERR_PTR(-ENOSYS); }
 
 static inline struct kernfs_node *
@@ -439,7 +436,7 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
 	key = (struct lock_class_key *)&ops->lockdep_key;
 #endif
 	return __kernfs_create_file(parent, name, mode, size, ops, priv, ns,
-				    false, key);
+				    key);
 }
 
 static inline struct kernfs_node *
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d5f6ec251fb2..29a7b2cc593e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 #endif
 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
 				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
-				  NULL, false, key);
+				  NULL, key);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
 
-- 
cgit v1.2.3


From 513e3d2d11c9f05db1edc70deb18a82555cf9309 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Feb 2015 14:36:50 -0800
Subject: cpumask: always use nr_cpu_ids in formatting and parsing functions

bitmap implements two variants of scnprintf functions to format a bitmap
into a string and cpumask and nodemask wrap them to provide equivalent
interfaces.  The scnprintf family of functions require a string buffer as
an output target which complicates code paths which just want to print out
the mask through printk for informational or debug purposes as they have
to worry about how large the buffer should be and whether it's too large
to allocate on stack.

Neither cpumask or nodemask provides a guildeline on how large the target
buffer should be forcing users come up with their own solutions - some
allocate an arbitrarily sized buffer which is small enough to allocate on
stack but may be too short in corner cases, other come up with a custom
upper limit calculation considering the output format, some allocate the
buffer dynamically while one resorted to using lock to synchronize access
to a static buffer.

This is an artificial problem which is being solved repeatedly for no
benefit.  In a lot of cases, the output area already exists and can be
targeted directly making the intermediate buffer unnecessary.  This
patchset teaches printf family of functions how to format bitmaps and
replace the dedicated formatting functions with it.

Pointer formatting is extended to cover bitmap formatting.  It uses the
field width for the number of bits instead of precision.  The format used
is '%*pb[l]', with the optional trailing 'l' specifying list format
instead of hex masks.  For more details, please see 0002.

This patch (of 31):

Currently, the formatting and parsing functions in cpumask.h use
nr_cpumask_bits like other cpumask functions; however, nr_cpumask_bits
is either NR_CPUS or nr_cpu_ids depending on CONFIG_CPUMASK_OFFSTACK.
This leads to inconsistent behaviors.

With CONFIG_NR_CPUS=512 and !CONFIG_CPUMASK_OFFSTACK

  # cat /sys/devices/virtual/net/lo/queues/rx-0/rps_cpus
  00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000
  # cat /proc/self/status | grep Cpus_allowed:
  Cpus_allowed:   f

With CONFIG_NR_CPUS=1024 and CONFIG_CPUMASK_OFFSTACK (fedora default)

  # cat /sys/devices/virtual/net/lo/queues/rx-0/rps_cpus
  0
  # cat /proc/self/status | grep Cpus_allowed:
  Cpus_allowed:   f

Note that /proc/self/status is always using nr_cpu_ids regardless of
config.  This is because seq cpumask formattings functions always use
nr_cpu_ids.

Given that the same output fields may switch between the two forms,
converging on nr_cpu_ids always isn't too likely to surprise userland.
This patch updates the formatting and parsing functions in cpumask.h
to always use nr_cpu_ids.  There's no point in dealing with CPUs which
aren't even possible on the machine.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <linux@arm.linux.org.uk>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpumask.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ff9044286d88..ee9acb0ce542 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -550,7 +550,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
 static inline int cpumask_scnprintf(char *buf, int len,
 				    const struct cpumask *srcp)
 {
-	return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpumask_bits);
+	return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpu_ids);
 }
 
 /**
@@ -564,7 +564,7 @@ static inline int cpumask_scnprintf(char *buf, int len,
 static inline int cpumask_parse_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
 {
-	return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
+	return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpu_ids);
 }
 
 /**
@@ -579,7 +579,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
 {
 	return bitmap_parselist_user(buf, len, cpumask_bits(dstp),
-							nr_cpumask_bits);
+				     nr_cpu_ids);
 }
 
 /**
@@ -595,7 +595,7 @@ static inline int cpulist_scnprintf(char *buf, int len,
 				    const struct cpumask *srcp)
 {
 	return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp),
-				    nr_cpumask_bits);
+				    nr_cpu_ids);
 }
 
 /**
@@ -610,7 +610,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
 	char *nl = strchr(buf, '\n');
 	unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf);
 
-	return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
+	return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpu_ids);
 }
 
 /**
@@ -622,7 +622,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
  */
 static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 {
-	return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits);
+	return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpu_ids);
 }
 
 /**
@@ -817,7 +817,7 @@ static inline ssize_t
 cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
 {
 	return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask),
-				      nr_cpumask_bits);
+				      nr_cpu_ids);
 }
 
 /*
-- 
cgit v1.2.3


From f1bbc032e45106400905ebb47550983af4690b0b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Feb 2015 14:36:57 -0800
Subject: cpumask, nodemask: implement cpumask/nodemask_pr_args()

printf family of functions can now format bitmaps using '%*pb[l]' and
all cpumask and nodemask formatting will be converted to use it.  To
ease printing these masks with '%*pb[l]' which require two params -
the number of bits and the actual bitmap, this patch implement
cpumask_pr_args() and nodemask_pr_args() which can be used to provide
arguments for '%*pb[l]'

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpumask.h  | 8 ++++++++
 include/linux/nodemask.h | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ee9acb0ce542..a9b3d00915a0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -22,6 +22,14 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
  */
 #define cpumask_bits(maskp) ((maskp)->bits)
 
+/**
+ * cpumask_pr_args - printf args to output a cpumask
+ * @maskp: cpumask to be printed
+ *
+ * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
+ */
+#define cpumask_pr_args(maskp)		nr_cpu_ids, cpumask_bits(maskp)
+
 #if NR_CPUS == 1
 #define nr_cpu_ids		1
 #else
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 21cef483dc1b..10f8e556ba07 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -98,6 +98,14 @@
 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
 
+/**
+ * nodemask_pr_args - printf args to output a nodemask
+ * @maskp: nodemask to be printed
+ *
+ * Can be used to provide arguments for '%*pb[l]' when printing a nodemask.
+ */
+#define nodemask_pr_args(maskp)		MAX_NUMNODES, (maskp)->bits
+
 /*
  * The inline keyword gives the compiler room to decide to inline, or
  * not inline a function as it sees best.  However, as these functions
-- 
cgit v1.2.3


From 46385326cc1577587ed3e7432c2425cf6d3e4308 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Feb 2015 14:38:15 -0800
Subject: bitmap, cpumask, nodemask: remove dedicated formatting functions

Now that all bitmap formatting usages have been converted to
'%*pb[l]', the separate formatting functions are unnecessary.  The
following functions are removed.

* bitmap_scn[list]printf()
* cpumask_scnprintf(), cpulist_scnprintf()
* [__]nodemask_scnprintf(), [__]nodelist_scnprintf()
* seq_bitmap[_list](), seq_cpumask[_list](), seq_nodemask[_list]()
* seq_buf_bitmask()

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c            | 32 --------------------------------
 include/linux/bitmap.h   |  7 -------
 include/linux/cpumask.h  | 31 -------------------------------
 include/linux/nodemask.h | 33 +++++++--------------------------
 include/linux/seq_buf.h  |  3 ---
 include/linux/seq_file.h | 25 -------------------------
 lib/bitmap.c             | 41 -----------------------------------------
 lib/seq_buf.c            | 36 ------------------------------------
 8 files changed, 7 insertions(+), 201 deletions(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index dbf3a59c86bb..555f82155be8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
 	return res;
 }
 
-int seq_bitmap(struct seq_file *m, const unsigned long *bits,
-				   unsigned int nr_bits)
-{
-	if (m->count < m->size) {
-		int len = bitmap_scnprintf(m->buf + m->count,
-				m->size - m->count, bits, nr_bits);
-		if (m->count + len < m->size) {
-			m->count += len;
-			return 0;
-		}
-	}
-	seq_set_overflow(m);
-	return -1;
-}
-EXPORT_SYMBOL(seq_bitmap);
-
-int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
-		unsigned int nr_bits)
-{
-	if (m->count < m->size) {
-		int len = bitmap_scnlistprintf(m->buf + m->count,
-				m->size - m->count, bits, nr_bits);
-		if (m->count + len < m->size) {
-			m->count += len;
-			return 0;
-		}
-	}
-	seq_set_overflow(m);
-	return -1;
-}
-EXPORT_SYMBOL(seq_bitmap_list);
-
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 5e7f75a6d7d0..dbfbf4990005 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -52,16 +52,13 @@
  * bitmap_bitremap(oldbit, old, new, nbits)	newbit = map(old, new)(oldbit)
  * bitmap_onto(dst, orig, relmap, nbits)	*dst = orig relative to relmap
  * bitmap_fold(dst, orig, sz, nbits)		dst bits = orig bits mod sz
- * bitmap_scnprintf(buf, len, src, nbits)	Print bitmap src to buf
  * bitmap_parse(buf, buflen, dst, nbits)	Parse bitmap dst from kernel buf
  * bitmap_parse_user(ubuf, ulen, dst, nbits)	Parse bitmap dst from user buf
- * bitmap_scnlistprintf(buf, len, src, nbits)	Print bitmap src as list to buf
  * bitmap_parselist(buf, dst, nbits)		Parse bitmap dst from kernel buf
  * bitmap_parselist_user(buf, dst, nbits)	Parse bitmap dst from user buf
  * bitmap_find_free_region(bitmap, bits, order)	Find and allocate bit region
  * bitmap_release_region(bitmap, pos, order)	Free specified bit region
  * bitmap_allocate_region(bitmap, pos, order)	Allocate specified bit region
- * bitmap_print_to_pagebuf(list, buf, mask, nbits) Print bitmap src as list/hex
  */
 
 /*
@@ -147,14 +144,10 @@ bitmap_find_next_zero_area(unsigned long *map,
 					      align_mask, 0);
 }
 
-extern int bitmap_scnprintf(char *buf, unsigned int len,
-			const unsigned long *src, int nbits);
 extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
 			unsigned long *dst, int nbits);
 extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
-extern int bitmap_scnlistprintf(char *buf, unsigned int len,
-			const unsigned long *src, int nbits);
 extern int bitmap_parselist(const char *buf, unsigned long *maskp,
 			int nmaskbits);
 extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index a9b3d00915a0..086549a665e2 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -546,21 +546,6 @@ static inline void cpumask_copy(struct cpumask *dstp,
  */
 #define cpumask_of(cpu) (get_cpu_mask(cpu))
 
-/**
- * cpumask_scnprintf - print a cpumask into a string as comma-separated hex
- * @buf: the buffer to sprintf into
- * @len: the length of the buffer
- * @srcp: the cpumask to print
- *
- * If len is zero, returns zero.  Otherwise returns the length of the
- * (nul-terminated) @buf string.
- */
-static inline int cpumask_scnprintf(char *buf, int len,
-				    const struct cpumask *srcp)
-{
-	return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpu_ids);
-}
-
 /**
  * cpumask_parse_user - extract a cpumask from a user string
  * @buf: the buffer to extract from
@@ -590,22 +575,6 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
 				     nr_cpu_ids);
 }
 
-/**
- * cpulist_scnprintf - print a cpumask into a string as comma-separated list
- * @buf: the buffer to sprintf into
- * @len: the length of the buffer
- * @srcp: the cpumask to print
- *
- * If len is zero, returns zero.  Otherwise returns the length of the
- * (nul-terminated) @buf string.
- */
-static inline int cpulist_scnprintf(char *buf, int len,
-				    const struct cpumask *srcp)
-{
-	return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp),
-				    nr_cpu_ids);
-}
-
 /**
  * cpumask_parse - extract a cpumask from from a string
  * @buf: the buffer to extract from
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 10f8e556ba07..6e85889cf9ab 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -8,14 +8,13 @@
  * See detailed comments in the file linux/bitmap.h describing the
  * data type on which these nodemasks are based.
  *
- * For details of nodemask_scnprintf() and nodemask_parse_user(),
- * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
- * For details of nodelist_scnprintf() and nodelist_parse(), see
- * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
- * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c.
- * For details of nodes_remap(), see bitmap_remap in lib/bitmap.c.
- * For details of nodes_onto(), see bitmap_onto in lib/bitmap.c.
- * For details of nodes_fold(), see bitmap_fold in lib/bitmap.c.
+ * For details of nodemask_parse_user(), see bitmap_parse_user() in
+ * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
+ * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
+ * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
+ * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
+ * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
+ * lib/bitmap.c.
  *
  * The available nodemask operations are:
  *
@@ -52,9 +51,7 @@
  * NODE_MASK_NONE			Initializer - no bits set
  * unsigned long *nodes_addr(mask)	Array of unsigned long's in mask
  *
- * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
  * int nodemask_parse_user(ubuf, ulen, mask)	Parse ascii string as nodemask
- * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
  * int nodelist_parse(buf, map)		Parse ascii string as nodelist
  * int node_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
  * void nodes_remap(dst, src, old, new)	*dst = map(old, new)(src)
@@ -312,14 +309,6 @@ static inline int __first_unset_node(const nodemask_t *maskp)
 
 #define nodes_addr(src) ((src).bits)
 
-#define nodemask_scnprintf(buf, len, src) \
-			__nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
-static inline int __nodemask_scnprintf(char *buf, int len,
-					const nodemask_t *srcp, int nbits)
-{
-	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
-}
-
 #define nodemask_parse_user(ubuf, ulen, dst) \
 		__nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
 static inline int __nodemask_parse_user(const char __user *buf, int len,
@@ -328,14 +317,6 @@ static inline int __nodemask_parse_user(const char __user *buf, int len,
 	return bitmap_parse_user(buf, len, dstp->bits, nbits);
 }
 
-#define nodelist_scnprintf(buf, len, src) \
-			__nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES)
-static inline int __nodelist_scnprintf(char *buf, int len,
-					const nodemask_t *srcp, int nbits)
-{
-	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
-}
-
 #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
 static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
 {
diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 9aafe0e24c68..fb7eb9ccb1cd 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -125,9 +125,6 @@ extern int seq_buf_putmem_hex(struct seq_buf *s, const void *mem,
 			      unsigned int len);
 extern int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc);
 
-extern int seq_buf_bitmask(struct seq_buf *s, const unsigned long *maskp,
-			   int nmaskbits);
-
 #ifdef CONFIG_BINARY_PRINTF
 extern int
 seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index cf6a9daaaf6d..afbb1fd77c77 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -126,31 +126,6 @@ int seq_path(struct seq_file *, const struct path *, const char *);
 int seq_dentry(struct seq_file *, struct dentry *, const char *);
 int seq_path_root(struct seq_file *m, const struct path *path,
 		  const struct path *root, const char *esc);
-int seq_bitmap(struct seq_file *m, const unsigned long *bits,
-				   unsigned int nr_bits);
-static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask)
-{
-	return seq_bitmap(m, cpumask_bits(mask), nr_cpu_ids);
-}
-
-static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
-{
-	return seq_bitmap(m, mask->bits, MAX_NUMNODES);
-}
-
-int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
-		unsigned int nr_bits);
-
-static inline int seq_cpumask_list(struct seq_file *m,
-				   const struct cpumask *mask)
-{
-	return seq_bitmap_list(m, cpumask_bits(mask), nr_cpu_ids);
-}
-
-static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask)
-{
-	return seq_bitmap_list(m, mask->bits, MAX_NUMNODES);
-}
 
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 088adbdcbad9..d456f4c15a9f 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -369,24 +369,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area_off);
 #define nbits_to_hold_value(val)	fls(val)
 #define BASEDEC 10		/* fancier cpuset lists input in decimal */
 
-/**
- * bitmap_scnprintf - convert bitmap to an ASCII hex string.
- * @buf: byte buffer into which string is placed
- * @buflen: reserved size of @buf, in bytes
- * @maskp: pointer to bitmap to convert
- * @nmaskbits: size of bitmap, in bits
- *
- * Exactly @nmaskbits bits are displayed.  Hex digits are grouped into
- * comma-separated sets of eight digits per set.  Returns the number of
- * characters which were written to *buf, excluding the trailing \0.
- */
-int bitmap_scnprintf(char *buf, unsigned int buflen,
-	const unsigned long *maskp, int nmaskbits)
-{
-	return scnprintf(buf, buflen, "%*pb", nmaskbits, maskp);
-}
-EXPORT_SYMBOL(bitmap_scnprintf);
-
 /**
  * __bitmap_parse - convert an ASCII hex string into a bitmap.
  * @buf: pointer to buffer containing string.
@@ -500,29 +482,6 @@ int bitmap_parse_user(const char __user *ubuf,
 }
 EXPORT_SYMBOL(bitmap_parse_user);
 
-/**
- * bitmap_scnlistprintf - convert bitmap to list format ASCII string
- * @buf: byte buffer into which string is placed
- * @buflen: reserved size of @buf, in bytes
- * @maskp: pointer to bitmap to convert
- * @nmaskbits: size of bitmap, in bits
- *
- * Output format is a comma-separated list of decimal numbers and
- * ranges.  Consecutively set bits are shown as two hyphen-separated
- * decimal numbers, the smallest and largest bit numbers set in
- * the range.  Output format is compatible with the format
- * accepted as input by bitmap_parselist().
- *
- * The return value is the number of characters which were written to *buf
- * excluding the trailing '\0', as per ISO C99's scnprintf.
- */
-int bitmap_scnlistprintf(char *buf, unsigned int buflen,
-	const unsigned long *maskp, int nmaskbits)
-{
-	return scnprintf(buf, buflen, "%*pbl", nmaskbits, maskp);
-}
-EXPORT_SYMBOL(bitmap_scnlistprintf);
-
 /**
  * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string
  * @list: indicates whether the bitmap must be list
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 4eedfedb9e31..88c0854bd752 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -91,42 +91,6 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
 	return ret;
 }
 
-/**
- * seq_buf_bitmask - write a bitmask array in its ASCII representation
- * @s:		seq_buf descriptor
- * @maskp:	points to an array of unsigned longs that represent a bitmask
- * @nmaskbits:	The number of bits that are valid in @maskp
- *
- * Writes a ASCII representation of a bitmask string into @s.
- *
- * Returns zero on success, -1 on overflow.
- */
-int seq_buf_bitmask(struct seq_buf *s, const unsigned long *maskp,
-		    int nmaskbits)
-{
-	unsigned int len = seq_buf_buffer_left(s);
-	int ret;
-
-	WARN_ON(s->size == 0);
-
-	/*
-	 * Note, because bitmap_scnprintf() only returns the number of bytes
-	 * written and not the number that would be written, we use the last
-	 * byte of the buffer to let us know if we overflowed. There's a small
-	 * chance that the bitmap could have fit exactly inside the buffer, but
-	 * it's not that critical if that does happen.
-	 */
-	if (len > 1) {
-		ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits);
-		if (ret < len) {
-			s->len += ret;
-			return 0;
-		}
-	}
-	seq_buf_set_overflow(s);
-	return -1;
-}
-
 #ifdef CONFIG_BINARY_PRINTF
 /**
  * seq_buf_bprintf - Write the printf string from binary arguments
-- 
cgit v1.2.3


From cb4188ac8e5779f66b9f55888ac2c75b391cde44 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:14 -0800
Subject: compiler: introduce __alias(symbol) shortcut

To be consistent with other compiler attributes introduce __alias(symbol)
macro expanding into __attribute__((alias(#symbol)))

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 02ae99e8e6d3..cdf13ca7cac3 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -66,6 +66,7 @@
 #define __deprecated			__attribute__((deprecated))
 #define __packed			__attribute__((packed))
 #define __weak				__attribute__((weak))
+#define __alias(symbol)		__attribute__((alias(#symbol)))
 
 /*
  * it doesn't make sense on ARM (currently the only user of __naked) to trace
-- 
cgit v1.2.3


From 0b24becc810dc3be6e3f94103a866f214c282394 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:17 -0800
Subject: kasan: add kernel address sanitizer infrastructure

Kernel Address sanitizer (KASan) is a dynamic memory error detector.  It
provides fast and comprehensive solution for finding use-after-free and
out-of-bounds bugs.

KASAN uses compile-time instrumentation for checking every memory access,
therefore GCC > v4.9.2 required.  v4.9.2 almost works, but has issues with
putting symbol aliases into the wrong section, which breaks kasan
instrumentation of globals.

This patch only adds infrastructure for kernel address sanitizer.  It's
not available for use yet.  The idea and some code was borrowed from [1].

Basic idea:

The main idea of KASAN is to use shadow memory to record whether each byte
of memory is safe to access or not, and use compiler's instrumentation to
check the shadow memory on each memory access.

Address sanitizer uses 1/8 of the memory addressable in kernel for shadow
memory and uses direct mapping with a scale and offset to translate a
memory address to its corresponding shadow address.

Here is function to translate address to corresponding shadow address:

     unsigned long kasan_mem_to_shadow(unsigned long addr)
     {
                return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
     }

where KASAN_SHADOW_SCALE_SHIFT = 3.

So for every 8 bytes there is one corresponding byte of shadow memory.
The following encoding used for each shadow byte: 0 means that all 8 bytes
of the corresponding memory region are valid for access; k (1 <= k <= 7)
means that the first k bytes are valid for access, and other (8 - k) bytes
are not; Any negative value indicates that the entire 8-bytes are
inaccessible.  Different negative values used to distinguish between
different kinds of inaccessible memory (redzones, freed memory) (see
mm/kasan/kasan.h).

To be able to detect accesses to bad memory we need a special compiler.
Such compiler inserts a specific function calls (__asan_load*(addr),
__asan_store*(addr)) before each memory access of size 1, 2, 4, 8 or 16.

These functions check whether memory region is valid to access or not by
checking corresponding shadow memory.  If access is not valid an error
printed.

Historical background of the address sanitizer from Dmitry Vyukov:

	"We've developed the set of tools, AddressSanitizer (Asan),
	ThreadSanitizer and MemorySanitizer, for user space. We actively use
	them for testing inside of Google (continuous testing, fuzzing,
	running prod services). To date the tools have found more than 10'000
	scary bugs in Chromium, Google internal codebase and various
	open-source projects (Firefox, OpenSSL, gcc, clang, ffmpeg, MySQL and
	lots of others): [2] [3] [4].
	The tools are part of both gcc and clang compilers.

	We have not yet done massive testing under the Kernel AddressSanitizer
	(it's kind of chicken and egg problem, you need it to be upstream to
	start applying it extensively). To date it has found about 50 bugs.
	Bugs that we've found in upstream kernel are listed in [5].
	We've also found ~20 bugs in out internal version of the kernel. Also
	people from Samsung and Oracle have found some.

	[...]

	As others noted, the main feature of AddressSanitizer is its
	performance due to inline compiler instrumentation and simple linear
	shadow memory. User-space Asan has ~2x slowdown on computational
	programs and ~2x memory consumption increase. Taking into account that
	kernel usually consumes only small fraction of CPU and memory when
	running real user-space programs, I would expect that kernel Asan will
	have ~10-30% slowdown and similar memory consumption increase (when we
	finish all tuning).

	I agree that Asan can well replace kmemcheck. We have plans to start
	working on Kernel MemorySanitizer that finds uses of unitialized
	memory. Asan+Msan will provide feature-parity with kmemcheck. As
	others noted, Asan will unlikely replace debug slab and pagealloc that
	can be enabled at runtime. Asan uses compiler instrumentation, so even
	if it is disabled, it still incurs visible overheads.

	Asan technology is easily portable to other architectures. Compiler
	instrumentation is fully portable. Runtime has some arch-dependent
	parts like shadow mapping and atomic operation interception. They are
	relatively easy to port."

Comparison with other debugging features:
========================================

KMEMCHECK:

  - KASan can do almost everything that kmemcheck can.  KASan uses
    compile-time instrumentation, which makes it significantly faster than
    kmemcheck.  The only advantage of kmemcheck over KASan is detection of
    uninitialized memory reads.

    Some brief performance testing showed that kasan could be
    x500-x600 times faster than kmemcheck:

$ netperf -l 30
		MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to localhost (127.0.0.1) port 0 AF_INET
		Recv   Send    Send
		Socket Socket  Message  Elapsed
		Size   Size    Size     Time     Throughput
		bytes  bytes   bytes    secs.    10^6bits/sec

no debug:	87380  16384  16384    30.00    41624.72

kasan inline:	87380  16384  16384    30.00    12870.54

kasan outline:	87380  16384  16384    30.00    10586.39

kmemcheck: 	87380  16384  16384    30.03      20.23

  - Also kmemcheck couldn't work on several CPUs.  It always sets
    number of CPUs to 1.  KASan doesn't have such limitation.

DEBUG_PAGEALLOC:
	- KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page
	  granularity level, so it able to find more bugs.

SLUB_DEBUG (poisoning, redzones):
	- SLUB_DEBUG has lower overhead than KASan.

	- SLUB_DEBUG in most cases are not able to detect bad reads,
	  KASan able to detect both reads and writes.

	- In some cases (e.g. redzone overwritten) SLUB_DEBUG detect
	  bugs only on allocation/freeing of object. KASan catch
	  bugs right before it will happen, so we always know exact
	  place of first bad read/write.

[1] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel
[2] https://code.google.com/p/address-sanitizer/wiki/FoundBugs
[3] https://code.google.com/p/thread-sanitizer/wiki/FoundBugs
[4] https://code.google.com/p/memory-sanitizer/wiki/FoundBugs
[5] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel#Trophies

Based on work by Andrey Konovalov.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Acked-by: Michal Marek <mmarek@suse.cz>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kasan.txt               | 170 +++++++++++++++++++
 Makefile                              |   3 +-
 drivers/firmware/efi/libstub/Makefile |   1 +
 include/linux/kasan.h                 |  46 ++++++
 include/linux/sched.h                 |   3 +
 lib/Kconfig.debug                     |   2 +
 lib/Kconfig.kasan                     |  43 +++++
 mm/Makefile                           |   1 +
 mm/kasan/Makefile                     |   8 +
 mm/kasan/kasan.c                      | 302 ++++++++++++++++++++++++++++++++++
 mm/kasan/kasan.h                      |  34 ++++
 mm/kasan/report.c                     | 209 +++++++++++++++++++++++
 scripts/Makefile.kasan                |  24 +++
 scripts/Makefile.lib                  |  10 ++
 14 files changed, 855 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/kasan.txt
 create mode 100644 include/linux/kasan.h
 create mode 100644 lib/Kconfig.kasan
 create mode 100644 mm/kasan/Makefile
 create mode 100644 mm/kasan/kasan.c
 create mode 100644 mm/kasan/kasan.h
 create mode 100644 mm/kasan/report.c
 create mode 100644 scripts/Makefile.kasan

(limited to 'include/linux')

diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
new file mode 100644
index 000000000000..f0645a8a992f
--- /dev/null
+++ b/Documentation/kasan.txt
@@ -0,0 +1,170 @@
+Kernel address sanitizer
+================
+
+0. Overview
+===========
+
+Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides
+a fast and comprehensive solution for finding use-after-free and out-of-bounds
+bugs.
+
+KASan uses compile-time instrumentation for checking every memory access,
+therefore you will need a certain version of GCC >= 4.9.2
+
+Currently KASan is supported only for x86_64 architecture and requires that the
+kernel be built with the SLUB allocator.
+
+1. Usage
+=========
+
+To enable KASAN configure kernel with:
+
+	  CONFIG_KASAN = y
+
+and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline
+is compiler instrumentation types. The former produces smaller binary the
+latter is 1.1 - 2 times faster. Inline instrumentation requires GCC 5.0 or
+latter.
+
+Currently KASAN works only with the SLUB memory allocator.
+For better bug detection and nicer report, enable CONFIG_STACKTRACE and put
+at least 'slub_debug=U' in the boot cmdline.
+
+To disable instrumentation for specific files or directories, add a line
+similar to the following to the respective kernel Makefile:
+
+        For a single file (e.g. main.o):
+                KASAN_SANITIZE_main.o := n
+
+        For all files in one directory:
+                KASAN_SANITIZE := n
+
+1.1 Error reports
+==========
+
+A typical out of bounds access report looks like this:
+
+==================================================================
+BUG: AddressSanitizer: out of bounds access in kmalloc_oob_right+0x65/0x75 [test_kasan] at addr ffff8800693bc5d3
+Write of size 1 by task modprobe/1689
+=============================================================================
+BUG kmalloc-128 (Not tainted): kasan error
+-----------------------------------------------------------------------------
+
+Disabling lock debugging due to kernel taint
+INFO: Allocated in kmalloc_oob_right+0x3d/0x75 [test_kasan] age=0 cpu=0 pid=1689
+ __slab_alloc+0x4b4/0x4f0
+ kmem_cache_alloc_trace+0x10b/0x190
+ kmalloc_oob_right+0x3d/0x75 [test_kasan]
+ init_module+0x9/0x47 [test_kasan]
+ do_one_initcall+0x99/0x200
+ load_module+0x2cb3/0x3b20
+ SyS_finit_module+0x76/0x80
+ system_call_fastpath+0x12/0x17
+INFO: Slab 0xffffea0001a4ef00 objects=17 used=7 fp=0xffff8800693bd728 flags=0x100000000004080
+INFO: Object 0xffff8800693bc558 @offset=1368 fp=0xffff8800693bc720
+
+Bytes b4 ffff8800693bc548: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a  ........ZZZZZZZZ
+Object ffff8800693bc558: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc568: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc578: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc588: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc598: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5a8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5b8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5c8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
+Redzone ffff8800693bc5d8: cc cc cc cc cc cc cc cc                          ........
+Padding ffff8800693bc718: 5a 5a 5a 5a 5a 5a 5a 5a                          ZZZZZZZZ
+CPU: 0 PID: 1689 Comm: modprobe Tainted: G    B          3.18.0-rc1-mm1+ #98
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
+ ffff8800693bc000 0000000000000000 ffff8800693bc558 ffff88006923bb78
+ ffffffff81cc68ae 00000000000000f3 ffff88006d407600 ffff88006923bba8
+ ffffffff811fd848 ffff88006d407600 ffffea0001a4ef00 ffff8800693bc558
+Call Trace:
+ [<ffffffff81cc68ae>] dump_stack+0x46/0x58
+ [<ffffffff811fd848>] print_trailer+0xf8/0x160
+ [<ffffffffa00026a7>] ? kmem_cache_oob+0xc3/0xc3 [test_kasan]
+ [<ffffffff811ff0f5>] object_err+0x35/0x40
+ [<ffffffffa0002065>] ? kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffff8120b9fa>] kasan_report_error+0x38a/0x3f0
+ [<ffffffff8120a79f>] ? kasan_poison_shadow+0x2f/0x40
+ [<ffffffff8120b344>] ? kasan_unpoison_shadow+0x14/0x40
+ [<ffffffff8120a79f>] ? kasan_poison_shadow+0x2f/0x40
+ [<ffffffffa00026a7>] ? kmem_cache_oob+0xc3/0xc3 [test_kasan]
+ [<ffffffff8120a995>] __asan_store1+0x75/0xb0
+ [<ffffffffa0002601>] ? kmem_cache_oob+0x1d/0xc3 [test_kasan]
+ [<ffffffffa0002065>] ? kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffffa0002065>] kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffffa00026b0>] init_module+0x9/0x47 [test_kasan]
+ [<ffffffff810002d9>] do_one_initcall+0x99/0x200
+ [<ffffffff811e4e5c>] ? __vunmap+0xec/0x160
+ [<ffffffff81114f63>] load_module+0x2cb3/0x3b20
+ [<ffffffff8110fd70>] ? m_show+0x240/0x240
+ [<ffffffff81115f06>] SyS_finit_module+0x76/0x80
+ [<ffffffff81cd3129>] system_call_fastpath+0x12/0x17
+Memory state around the buggy address:
+ ffff8800693bc300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc380: fc fc 00 00 00 00 00 00 00 00 00 00 00 00 00 fc
+ ffff8800693bc400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc500: fc fc fc fc fc fc fc fc fc fc fc 00 00 00 00 00
+>ffff8800693bc580: 00 00 00 00 00 00 00 00 00 00 03 fc fc fc fc fc
+                                                 ^
+ ffff8800693bc600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc700: fc fc fc fc fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8800693bc780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+==================================================================
+
+First sections describe slub object where bad access happened.
+See 'SLUB Debug output' section in Documentation/vm/slub.txt for details.
+
+In the last section the report shows memory state around the accessed address.
+Reading this part requires some more understanding of how KASAN works.
+
+Each 8 bytes of memory are encoded in one shadow byte as accessible,
+partially accessible, freed or they can be part of a redzone.
+We use the following encoding for each shadow byte: 0 means that all 8 bytes
+of the corresponding memory region are accessible; number N (1 <= N <= 7) means
+that the first N bytes are accessible, and other (8 - N) bytes are not;
+any negative value indicates that the entire 8-byte word is inaccessible.
+We use different negative values to distinguish between different kinds of
+inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h).
+
+In the report above the arrows point to the shadow byte 03, which means that
+the accessed address is partially accessible.
+
+
+2. Implementation details
+========================
+
+From a high level, our approach to memory error detection is similar to that
+of kmemcheck: use shadow memory to record whether each byte of memory is safe
+to access, and use compile-time instrumentation to check shadow memory on each
+memory access.
+
+AddressSanitizer dedicates 1/8 of kernel memory to its shadow memory
+(e.g. 16TB to cover 128TB on x86_64) and uses direct mapping with a scale and
+offset to translate a memory address to its corresponding shadow address.
+
+Here is the function witch translate an address to its corresponding shadow
+address:
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	return ((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+		+ KASAN_SHADOW_OFFSET;
+}
+
+where KASAN_SHADOW_SCALE_SHIFT = 3.
+
+Compile-time instrumentation used for checking memory accesses. Compiler inserts
+function calls (__asan_load*(addr), __asan_store*(addr)) before each memory
+access of size 1, 2, 4, 8 or 16. These functions check whether memory access is
+valid or not by checking corresponding shadow memory.
+
+GCC 5.0 has possibility to perform inline instrumentation. Instead of making
+function calls GCC directly inserts the code to check the shadow memory.
+This option significantly enlarges kernel but it gives x1.1-x2 performance
+boost over outline instrumented kernel.
diff --git a/Makefile b/Makefile
index 5fa2e3035509..33cb15efd257 100644
--- a/Makefile
+++ b/Makefile
@@ -423,7 +423,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
 export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
 export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -781,6 +781,7 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
 	KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
+include $(srctree)/scripts/Makefile.kasan
 include $(srctree)/scripts/Makefile.extrawarn
 
 # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 8902f52e0998..280bc0a63365 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -19,6 +19,7 @@ KBUILD_CFLAGS			:= $(cflags-y) \
 				   $(call cc-option,-fno-stack-protector)
 
 GCOV_PROFILE			:= n
+KASAN_SANITIZE			:= n
 
 lib-y				:= efi-stub-helper.o
 lib-$(CONFIG_EFI_ARMSTUB)	+= arm-stub.o fdt.o
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
new file mode 100644
index 000000000000..9102fda60def
--- /dev/null
+++ b/include/linux/kasan.h
@@ -0,0 +1,46 @@
+#ifndef _LINUX_KASAN_H
+#define _LINUX_KASAN_H
+
+#include <linux/types.h>
+
+struct kmem_cache;
+struct page;
+
+#ifdef CONFIG_KASAN
+
+#define KASAN_SHADOW_SCALE_SHIFT 3
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
+#include <asm/kasan.h>
+#include <linux/sched.h>
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+		+ KASAN_SHADOW_OFFSET;
+}
+
+/* Enable reporting bugs after kasan_disable_current() */
+static inline void kasan_enable_current(void)
+{
+	current->kasan_depth++;
+}
+
+/* Disable reporting bugs for current task */
+static inline void kasan_disable_current(void)
+{
+	current->kasan_depth--;
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size);
+
+#else /* CONFIG_KASAN */
+
+static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
+
+static inline void kasan_enable_current(void) {}
+static inline void kasan_disable_current(void) {}
+
+#endif /* CONFIG_KASAN */
+
+#endif /* LINUX_KASAN_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 048b91b983ed..41c60e5302d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1664,6 +1664,9 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+#ifdef CONFIG_KASAN
+	unsigned int kasan_depth;
+#endif
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 79a9bb67aeaf..ecb3516f6546 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -651,6 +651,8 @@ config DEBUG_STACKOVERFLOW
 
 source "lib/Kconfig.kmemcheck"
 
+source "lib/Kconfig.kasan"
+
 endmenu # "Memory Debugging"
 
 config DEBUG_SHIRQ
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
new file mode 100644
index 000000000000..e5b3fbe5560f
--- /dev/null
+++ b/lib/Kconfig.kasan
@@ -0,0 +1,43 @@
+config HAVE_ARCH_KASAN
+	bool
+
+if HAVE_ARCH_KASAN
+
+config KASAN
+	bool "KASan: runtime memory debugger"
+	help
+	  Enables kernel address sanitizer - runtime memory debugger,
+	  designed to find out-of-bounds accesses and use-after-free bugs.
+	  This is strictly debugging feature. It consumes about 1/8
+	  of available memory and brings about ~x3 performance slowdown.
+	  For better error detection enable CONFIG_STACKTRACE,
+	  and add slub_debug=U to boot cmdline.
+
+config KASAN_SHADOW_OFFSET
+	hex
+
+choice
+	prompt "Instrumentation type"
+	depends on KASAN
+	default KASAN_OUTLINE
+
+config KASAN_OUTLINE
+	bool "Outline instrumentation"
+	help
+	  Before every memory access compiler insert function call
+	  __asan_load*/__asan_store*. These functions performs check
+	  of shadow memory. This is slower than inline instrumentation,
+	  however it doesn't bloat size of kernel's .text section so
+	  much as inline does.
+
+config KASAN_INLINE
+	bool "Inline instrumentation"
+	help
+	  Compiler directly inserts code checking shadow memory before
+	  memory accesses. This is faster than outline (in some workloads
+	  it gives about x2 boost over outline instrumentation), but
+	  make kernel's .text size much bigger.
+
+endchoice
+
+endif
diff --git a/mm/Makefile b/mm/Makefile
index 3548460ab7b6..930b52df4aca 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
new file mode 100644
index 000000000000..bd837b8c2f41
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,8 @@
+KASAN_SANITIZE := n
+
+CFLAGS_REMOVE_kasan.o = -pg
+# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
+# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-y := kasan.o report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
new file mode 100644
index 000000000000..6dc1aa7cefcc
--- /dev/null
+++ b/mm/kasan/kasan.c
@@ -0,0 +1,302 @@
+/*
+ * This file contains shadow memory manipulation code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of code borrowed from https://github.com/xairy/linux by
+ *        Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include "kasan.h"
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+static void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+	void *shadow_start, *shadow_end;
+
+	shadow_start = kasan_mem_to_shadow(address);
+	shadow_end = kasan_mem_to_shadow(address + size);
+
+	memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+	kasan_poison_shadow(address, size, 0);
+
+	if (size & KASAN_SHADOW_MASK) {
+		u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+		*shadow = size & KASAN_SHADOW_MASK;
+	}
+}
+
+
+/*
+ * All functions below always inlined so compiler could
+ * perform better optimizations in each of __asan_loadX/__assn_storeX
+ * depending on memory access size X.
+ */
+
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+	s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(shadow_value)) {
+		s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+		return unlikely(last_accessible_byte >= shadow_value);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 1))
+			return true;
+
+		if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_4(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 3))
+			return true;
+
+		if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_8(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 7))
+			return true;
+
+		if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+	u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		u16 shadow_first_bytes = *(u16 *)shadow_addr;
+		s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
+
+		if (unlikely(shadow_first_bytes))
+			return true;
+
+		if (likely(!last_byte))
+			return false;
+
+		return memory_is_poisoned_1(addr + 15);
+	}
+
+	return false;
+}
+
+static __always_inline unsigned long bytes_is_zero(const u8 *start,
+					size_t size)
+{
+	while (size) {
+		if (unlikely(*start))
+			return (unsigned long)start;
+		start++;
+		size--;
+	}
+
+	return 0;
+}
+
+static __always_inline unsigned long memory_is_zero(const void *start,
+						const void *end)
+{
+	unsigned int words;
+	unsigned long ret;
+	unsigned int prefix = (unsigned long)start % 8;
+
+	if (end - start <= 16)
+		return bytes_is_zero(start, end - start);
+
+	if (prefix) {
+		prefix = 8 - prefix;
+		ret = bytes_is_zero(start, prefix);
+		if (unlikely(ret))
+			return ret;
+		start += prefix;
+	}
+
+	words = (end - start) / 8;
+	while (words) {
+		if (unlikely(*(u64 *)start))
+			return bytes_is_zero(start, 8);
+		start += 8;
+		words--;
+	}
+
+	return bytes_is_zero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+						size_t size)
+{
+	unsigned long ret;
+
+	ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+			kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+	if (unlikely(ret)) {
+		unsigned long last_byte = addr + size - 1;
+		s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+		if (unlikely(ret != (unsigned long)last_shadow ||
+			((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+			return true;
+	}
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+	if (__builtin_constant_p(size)) {
+		switch (size) {
+		case 1:
+			return memory_is_poisoned_1(addr);
+		case 2:
+			return memory_is_poisoned_2(addr);
+		case 4:
+			return memory_is_poisoned_4(addr);
+		case 8:
+			return memory_is_poisoned_8(addr);
+		case 16:
+			return memory_is_poisoned_16(addr);
+		default:
+			BUILD_BUG();
+		}
+	}
+
+	return memory_is_poisoned_n(addr, size);
+}
+
+
+static __always_inline void check_memory_region(unsigned long addr,
+						size_t size, bool write)
+{
+	struct kasan_access_info info;
+
+	if (unlikely(size == 0))
+		return;
+
+	if (unlikely((void *)addr <
+		kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+		info.access_addr = (void *)addr;
+		info.access_size = size;
+		info.is_write = write;
+		info.ip = _RET_IP_;
+		kasan_report_user_access(&info);
+		return;
+	}
+
+	if (likely(!memory_is_poisoned(addr, size)))
+		return;
+
+	kasan_report(addr, size, write, _RET_IP_);
+}
+
+#define DEFINE_ASAN_LOAD_STORE(size)				\
+	void __asan_load##size(unsigned long addr)		\
+	{							\
+		check_memory_region(addr, size, false);		\
+	}							\
+	EXPORT_SYMBOL(__asan_load##size);			\
+	__alias(__asan_load##size)				\
+	void __asan_load##size##_noabort(unsigned long);	\
+	EXPORT_SYMBOL(__asan_load##size##_noabort);		\
+	void __asan_store##size(unsigned long addr)		\
+	{							\
+		check_memory_region(addr, size, true);		\
+	}							\
+	EXPORT_SYMBOL(__asan_store##size);			\
+	__alias(__asan_store##size)				\
+	void __asan_store##size##_noabort(unsigned long);	\
+	EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+	check_memory_region(addr, size, false);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+	check_memory_region(addr, size, true);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
new file mode 100644
index 000000000000..648b9c006f3f
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,34 @@
+#ifndef __MM_KASAN_KASAN_H
+#define __MM_KASAN_KASAN_H
+
+#include <linux/kasan.h>
+
+#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#define KASAN_SHADOW_MASK       (KASAN_SHADOW_SCALE_SIZE - 1)
+
+struct kasan_access_info {
+	const void *access_addr;
+	const void *first_bad_addr;
+	size_t access_size;
+	bool is_write;
+	unsigned long ip;
+};
+
+void kasan_report_error(struct kasan_access_info *info);
+void kasan_report_user_access(struct kasan_access_info *info);
+
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
+		<< KASAN_SHADOW_SCALE_SHIFT);
+}
+
+static inline bool kasan_enabled(void)
+{
+	return !current->kasan_depth;
+}
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip);
+
+#endif
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
new file mode 100644
index 000000000000..5835d69563f5
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,209 @@
+/*
+ * This file contains error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of code borrowed from https://github.com/xairy/linux by
+ *        Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include "kasan.h"
+
+/* Shadow layout customization. */
+#define SHADOW_BYTES_PER_BLOCK 1
+#define SHADOW_BLOCKS_PER_ROW 16
+#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
+#define SHADOW_ROWS_AROUND_ADDR 2
+
+static const void *find_first_bad_addr(const void *addr, size_t size)
+{
+	u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
+	const void *first_bad_addr = addr;
+
+	while (!shadow_val && first_bad_addr < addr + size) {
+		first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
+		shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
+	}
+	return first_bad_addr;
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+	const char *bug_type = "unknown crash";
+	u8 shadow_val;
+
+	info->first_bad_addr = find_first_bad_addr(info->access_addr,
+						info->access_size);
+
+	shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+	switch (shadow_val) {
+	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+		bug_type = "out of bounds access";
+		break;
+	}
+
+	pr_err("BUG: KASan: %s in %pS at addr %p\n",
+		bug_type, (void *)info->ip,
+		info->access_addr);
+	pr_err("%s of size %zu by task %s/%d\n",
+		info->is_write ? "Write" : "Read",
+		info->access_size, current->comm, task_pid_nr(current));
+}
+
+static void print_address_description(struct kasan_access_info *info)
+{
+	dump_stack();
+}
+
+static bool row_is_guilty(const void *row, const void *guilty)
+{
+	return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+}
+
+static int shadow_pointer_offset(const void *row, const void *shadow)
+{
+	/* The length of ">ff00ff00ff00ff00: " is
+	 *    3 + (BITS_PER_LONG/8)*2 chars.
+	 */
+	return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
+		(shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+}
+
+static void print_shadow_for_address(const void *addr)
+{
+	int i;
+	const void *shadow = kasan_mem_to_shadow(addr);
+	const void *shadow_row;
+
+	shadow_row = (void *)round_down((unsigned long)shadow,
+					SHADOW_BYTES_PER_ROW)
+		- SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+
+	pr_err("Memory state around the buggy address:\n");
+
+	for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
+		const void *kaddr = kasan_shadow_to_mem(shadow_row);
+		char buffer[4 + (BITS_PER_LONG/8)*2];
+
+		snprintf(buffer, sizeof(buffer),
+			(i == 0) ? ">%p: " : " %p: ", kaddr);
+
+		kasan_disable_current();
+		print_hex_dump(KERN_ERR, buffer,
+			DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
+			shadow_row, SHADOW_BYTES_PER_ROW, 0);
+		kasan_enable_current();
+
+		if (row_is_guilty(shadow_row, shadow))
+			pr_err("%*c\n",
+				shadow_pointer_offset(shadow_row, shadow),
+				'^');
+
+		shadow_row += SHADOW_BYTES_PER_ROW;
+	}
+}
+
+static DEFINE_SPINLOCK(report_lock);
+
+void kasan_report_error(struct kasan_access_info *info)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&report_lock, flags);
+	pr_err("================================="
+		"=================================\n");
+	print_error_description(info);
+	print_address_description(info);
+	print_shadow_for_address(info->first_bad_addr);
+	pr_err("================================="
+		"=================================\n");
+	spin_unlock_irqrestore(&report_lock, flags);
+}
+
+void kasan_report_user_access(struct kasan_access_info *info)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&report_lock, flags);
+	pr_err("================================="
+		"=================================\n");
+	pr_err("BUG: KASan: user-memory-access on address %p\n",
+		info->access_addr);
+	pr_err("%s of size %zu by task %s/%d\n",
+		info->is_write ? "Write" : "Read",
+		info->access_size, current->comm, task_pid_nr(current));
+	dump_stack();
+	pr_err("================================="
+		"=================================\n");
+	spin_unlock_irqrestore(&report_lock, flags);
+}
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip)
+{
+	struct kasan_access_info info;
+
+	if (likely(!kasan_enabled()))
+		return;
+
+	info.access_addr = (void *)addr;
+	info.access_size = size;
+	info.is_write = is_write;
+	info.ip = ip;
+	kasan_report_error(&info);
+}
+
+
+#define DEFINE_ASAN_REPORT_LOAD(size)                     \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{                                                         \
+	kasan_report(addr, size, false, _RET_IP_);	  \
+}                                                         \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size)                     \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{                                                          \
+	kasan_report(addr, size, true, _RET_IP_);	   \
+}                                                          \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+	kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+	kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
new file mode 100644
index 000000000000..7acd6faa0335
--- /dev/null
+++ b/scripts/Makefile.kasan
@@ -0,0 +1,24 @@
+ifdef CONFIG_KASAN
+ifdef CONFIG_KASAN_INLINE
+	call_threshold := 10000
+else
+	call_threshold := 0
+endif
+
+CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
+
+CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
+		-fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \
+		--param asan-instrumentation-with-call-threshold=$(call_threshold))
+
+ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
+        $(warning Cannot use CONFIG_KASAN: \
+            -fsanitize=kernel-address is not supported by compiler)
+else
+    ifeq ($(CFLAGS_KASAN),)
+        $(warning CONFIG_KASAN: compiler does not support all options.\
+            Trying minimal configuration)
+        CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL)
+    endif
+endif
+endif
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 511755200634..044eb4f89a91 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -119,6 +119,16 @@ _c_flags += $(if $(patsubst n%,, \
 		$(CFLAGS_GCOV))
 endif
 
+#
+# Enable address sanitizer flags for kernel except some files or directories
+# we don't want to check (depends on variables KASAN_SANITIZE_obj.o, KASAN_SANITIZE)
+#
+ifeq ($(CONFIG_KASAN),y)
+_c_flags += $(if $(patsubst n%,, \
+		$(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \
+		$(CFLAGS_KASAN))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
 
-- 
cgit v1.2.3


From b8c73fc2493d42517be95cf2c89659fc6c6f4d02 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:28 -0800
Subject: mm: page_alloc: add kasan hooks on alloc and free paths

Add kernel address sanitizer hooks to mark allocated page's addresses as
accessible in corresponding shadow region.  Mark freed pages as
inaccessible.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  6 ++++++
 mm/compaction.c       |  2 ++
 mm/kasan/kasan.c      | 14 ++++++++++++++
 mm/kasan/kasan.h      |  2 ++
 mm/kasan/report.c     | 11 +++++++++++
 mm/page_alloc.c       |  3 +++
 6 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 9102fda60def..f00c15c41235 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -34,6 +34,9 @@ static inline void kasan_disable_current(void)
 
 void kasan_unpoison_shadow(const void *address, size_t size);
 
+void kasan_alloc_pages(struct page *page, unsigned int order);
+void kasan_free_pages(struct page *page, unsigned int order);
+
 #else /* CONFIG_KASAN */
 
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
@@ -41,6 +44,9 @@ static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
 static inline void kasan_enable_current(void) {}
 static inline void kasan_disable_current(void) {}
 
+static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
+static inline void kasan_free_pages(struct page *page, unsigned int order) {}
+
 #endif /* CONFIG_KASAN */
 
 #endif /* LINUX_KASAN_H */
diff --git a/mm/compaction.c b/mm/compaction.c
index d50d6de6f1b6..8c0d9459b54a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
 #include <linux/sysfs.h>
 #include <linux/balloon_compaction.h>
 #include <linux/page-isolation.h>
+#include <linux/kasan.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -72,6 +73,7 @@ static void map_pages(struct list_head *list)
 	list_for_each_entry(page, list, lru) {
 		arch_alloc_page(page, 0);
 		kernel_map_pages(page, 1, 1);
+		kasan_alloc_pages(page, 0);
 	}
 }
 
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index def81104772f..b516eb8632b9 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -254,6 +254,20 @@ static __always_inline void check_memory_region(unsigned long addr,
 	kasan_report(addr, size, write, _RET_IP_);
 }
 
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+	if (likely(!PageHighMem(page)))
+		kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+	if (likely(!PageHighMem(page)))
+		kasan_poison_shadow(page_address(page),
+				PAGE_SIZE << order,
+				KASAN_FREE_PAGE);
+}
+
 #define DEFINE_ASAN_LOAD_STORE(size)				\
 	void __asan_load##size(unsigned long addr)		\
 	{							\
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 648b9c006f3f..d3c90d5dd97a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -6,6 +6,8 @@
 #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
 #define KASAN_SHADOW_MASK       (KASAN_SHADOW_SCALE_SIZE - 1)
 
+#define KASAN_FREE_PAGE         0xFF  /* page was freed */
+
 struct kasan_access_info {
 	const void *access_addr;
 	const void *first_bad_addr;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5835d69563f5..fab8e7882ff1 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -54,6 +54,9 @@ static void print_error_description(struct kasan_access_info *info)
 	shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
 
 	switch (shadow_val) {
+	case KASAN_FREE_PAGE:
+		bug_type = "use after free";
+		break;
 	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
 		bug_type = "out of bounds access";
 		break;
@@ -69,6 +72,14 @@ static void print_error_description(struct kasan_access_info *info)
 
 static void print_address_description(struct kasan_access_info *info)
 {
+	const void *addr = info->access_addr;
+
+	if ((addr >= (void *)PAGE_OFFSET) &&
+		(addr < high_memory)) {
+		struct page *page = virt_to_head_page(addr);
+		dump_page(page, "kasan: bad access detected");
+	}
+
 	dump_stack();
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb4758263f6b..a47f0b229a1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
+#include <linux/kasan.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/pagevec.h>
@@ -787,6 +788,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 
 	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
+	kasan_free_pages(page, order);
 
 	if (PageAnon(page))
 		page->mapping = NULL;
@@ -970,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
+	kasan_alloc_pages(page, order);
 
 	if (gfp_flags & __GFP_ZERO)
 		prep_zero_page(page, order, gfp_flags);
-- 
cgit v1.2.3


From 912f5fbf1d3060f25d6994aed0265c55b974b2e9 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:31 -0800
Subject: mm: slub: introduce virt_to_obj function

virt_to_obj takes kmem_cache address, address of slab page, address x
pointing somewhere inside slab object, and returns address of the
beginning of object.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 9abf04ed0999..db7d5de00c5f 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -110,4 +110,20 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
 }
 #endif
 
+
+/**
+ * virt_to_obj - returns address of the beginning of object.
+ * @s: object's kmem_cache
+ * @slab_page: address of slab page
+ * @x: address within object memory range
+ *
+ * Returns address of the beginning of object
+ */
+static inline void *virt_to_obj(struct kmem_cache *s,
+				const void *slab_page,
+				const void *x)
+{
+	return (void *)x - ((x - slab_page) % s->size);
+}
+
 #endif /* _LINUX_SLUB_DEF_H */
-- 
cgit v1.2.3


From 75c66def8d815201aa0386ecc7c66a5c8dbca1ee Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:35 -0800
Subject: mm: slub: share object_err function

Remove static and add function declarations to linux/slub_def.h so it
could be used by kernel address sanitizer.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h | 3 +++
 mm/slub.c                | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index db7d5de00c5f..33885118523c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -126,4 +126,7 @@ static inline void *virt_to_obj(struct kmem_cache *s,
 	return (void *)x - ((x - slab_page) % s->size);
 }
 
+void object_err(struct kmem_cache *s, struct page *page,
+		u8 *object, char *reason);
+
 #endif /* _LINUX_SLUB_DEF_H */
diff --git a/mm/slub.c b/mm/slub.c
index 783505ba2052..6833b73ef6b3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -629,7 +629,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 	dump_stack();
 }
 
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
 			u8 *object, char *reason)
 {
 	slab_bug(s, "%s", reason);
-- 
cgit v1.2.3


From 0316bec22ec95ea2faca6406437b0b5950553b7c Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:42 -0800
Subject: mm: slub: add kernel address sanitizer support for slub allocator

With this patch kasan will be able to catch bugs in memory allocated by
slub.  Initially all objects in newly allocated slab page, marked as
redzone.  Later, when allocation of slub object happens, requested by
caller number of bytes marked as accessible, and the rest of the object
(including slub's metadata) marked as redzone (inaccessible).

We also mark object as accessible if ksize was called for this object.
There is some places in kernel where ksize function is called to inquire
size of really allocated area.  Such callers could validly access whole
allocated memory, so it should be marked as accessible.

Code in slub.c and slab_common.c files could validly access to object's
metadata, so instrumentation for this files are disabled.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Signed-off-by: Dmitry Chernenkov <dmitryc@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 27 ++++++++++++++
 include/linux/slab.h  | 11 ++++--
 lib/Kconfig.kasan     |  1 +
 mm/Makefile           |  3 ++
 mm/kasan/kasan.c      | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/kasan/kasan.h      |  5 +++
 mm/kasan/report.c     | 21 +++++++++++
 mm/slab_common.c      |  5 ++-
 mm/slub.c             | 31 ++++++++++++++--
 9 files changed, 197 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index f00c15c41235..d5310eef3e38 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -37,6 +37,18 @@ void kasan_unpoison_shadow(const void *address, size_t size);
 void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
 
+void kasan_poison_slab(struct page *page);
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
+void kasan_poison_object_data(struct kmem_cache *cache, void *object);
+
+void kasan_kmalloc_large(const void *ptr, size_t size);
+void kasan_kfree_large(const void *ptr);
+void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size);
+void kasan_krealloc(const void *object, size_t new_size);
+
+void kasan_slab_alloc(struct kmem_cache *s, void *object);
+void kasan_slab_free(struct kmem_cache *s, void *object);
+
 #else /* CONFIG_KASAN */
 
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
@@ -47,6 +59,21 @@ static inline void kasan_disable_current(void) {}
 static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
 static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 
+static inline void kasan_poison_slab(struct page *page) {}
+static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+					void *object) {}
+static inline void kasan_poison_object_data(struct kmem_cache *cache,
+					void *object) {}
+
+static inline void kasan_kmalloc_large(void *ptr, size_t size) {}
+static inline void kasan_kfree_large(const void *ptr) {}
+static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
+				size_t size) {}
+static inline void kasan_krealloc(const void *object, size_t new_size) {}
+
+static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {}
+static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
+
 #endif /* CONFIG_KASAN */
 
 #endif /* LINUX_KASAN_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ed2ffaab59ea..76f1feeabd38 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -104,6 +104,7 @@
 				(unsigned long)ZERO_SIZE_PTR)
 
 #include <linux/kmemleak.h>
+#include <linux/kasan.h>
 
 struct mem_cgroup;
 /*
@@ -325,7 +326,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
 		gfp_t flags, size_t size)
 {
-	return kmem_cache_alloc(s, flags);
+	void *ret = kmem_cache_alloc(s, flags);
+
+	kasan_kmalloc(s, ret, size);
+	return ret;
 }
 
 static __always_inline void *
@@ -333,7 +337,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 			      gfp_t gfpflags,
 			      int node, size_t size)
 {
-	return kmem_cache_alloc_node(s, gfpflags, node);
+	void *ret = kmem_cache_alloc_node(s, gfpflags, node);
+
+	kasan_kmalloc(s, ret, size);
+	return ret;
 }
 #endif /* CONFIG_TRACING */
 
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 0052b1b9aadd..a11ac0234452 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -5,6 +5,7 @@ if HAVE_ARCH_KASAN
 
 config KASAN
 	bool "KASan: runtime memory debugger"
+	depends on SLUB_DEBUG
 	help
 	  Enables kernel address sanitizer - runtime memory debugger,
 	  designed to find out-of-bounds accesses and use-after-free bugs.
diff --git a/mm/Makefile b/mm/Makefile
index 930b52df4aca..088c68e9ec35 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,6 +2,9 @@
 # Makefile for the linux memory manager.
 #
 
+KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slub.o := n
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index b516eb8632b9..dc83f070edb6 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -31,6 +31,7 @@
 #include <linux/kasan.h>
 
 #include "kasan.h"
+#include "../slab.h"
 
 /*
  * Poisons the shadow memory for 'size' bytes starting from 'addr'.
@@ -268,6 +269,103 @@ void kasan_free_pages(struct page *page, unsigned int order)
 				KASAN_FREE_PAGE);
 }
 
+void kasan_poison_slab(struct page *page)
+{
+	kasan_poison_shadow(page_address(page),
+			PAGE_SIZE << compound_order(page),
+			KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+	kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+	kasan_poison_shadow(object,
+			round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+			KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+{
+	kasan_kmalloc(cache, object, cache->object_size);
+}
+
+void kasan_slab_free(struct kmem_cache *cache, void *object)
+{
+	unsigned long size = cache->object_size;
+	unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+
+	/* RCU slabs could be legally used after free within the RCU period */
+	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+		return;
+
+	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+{
+	unsigned long redzone_start;
+	unsigned long redzone_end;
+
+	if (unlikely(object == NULL))
+		return;
+
+	redzone_start = round_up((unsigned long)(object + size),
+				KASAN_SHADOW_SCALE_SIZE);
+	redzone_end = round_up((unsigned long)object + cache->object_size,
+				KASAN_SHADOW_SCALE_SIZE);
+
+	kasan_unpoison_shadow(object, size);
+	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+		KASAN_KMALLOC_REDZONE);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
+
+void kasan_kmalloc_large(const void *ptr, size_t size)
+{
+	struct page *page;
+	unsigned long redzone_start;
+	unsigned long redzone_end;
+
+	if (unlikely(ptr == NULL))
+		return;
+
+	page = virt_to_page(ptr);
+	redzone_start = round_up((unsigned long)(ptr + size),
+				KASAN_SHADOW_SCALE_SIZE);
+	redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+
+	kasan_unpoison_shadow(ptr, size);
+	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+		KASAN_PAGE_REDZONE);
+}
+
+void kasan_krealloc(const void *object, size_t size)
+{
+	struct page *page;
+
+	if (unlikely(object == ZERO_SIZE_PTR))
+		return;
+
+	page = virt_to_head_page(object);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_kmalloc_large(object, size);
+	else
+		kasan_kmalloc(page->slab_cache, object, size);
+}
+
+void kasan_kfree_large(const void *ptr)
+{
+	struct page *page = virt_to_page(ptr);
+
+	kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+			KASAN_FREE_PAGE);
+}
+
 #define DEFINE_ASAN_LOAD_STORE(size)				\
 	void __asan_load##size(unsigned long addr)		\
 	{							\
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index d3c90d5dd97a..5b052ab40cf9 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -7,6 +7,11 @@
 #define KASAN_SHADOW_MASK       (KASAN_SHADOW_SCALE_SIZE - 1)
 
 #define KASAN_FREE_PAGE         0xFF  /* page was freed */
+#define KASAN_FREE_PAGE         0xFF  /* page was freed */
+#define KASAN_PAGE_REDZONE      0xFE  /* redzone for kmalloc_large allocations */
+#define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
+#define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
+
 
 struct kasan_access_info {
 	const void *access_addr;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index fab8e7882ff1..2760edb4d0a8 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -24,6 +24,7 @@
 #include <linux/kasan.h>
 
 #include "kasan.h"
+#include "../slab.h"
 
 /* Shadow layout customization. */
 #define SHADOW_BYTES_PER_BLOCK 1
@@ -55,8 +56,11 @@ static void print_error_description(struct kasan_access_info *info)
 
 	switch (shadow_val) {
 	case KASAN_FREE_PAGE:
+	case KASAN_KMALLOC_FREE:
 		bug_type = "use after free";
 		break;
+	case KASAN_PAGE_REDZONE:
+	case KASAN_KMALLOC_REDZONE:
 	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
 		bug_type = "out of bounds access";
 		break;
@@ -77,6 +81,23 @@ static void print_address_description(struct kasan_access_info *info)
 	if ((addr >= (void *)PAGE_OFFSET) &&
 		(addr < high_memory)) {
 		struct page *page = virt_to_head_page(addr);
+
+		if (PageSlab(page)) {
+			void *object;
+			struct kmem_cache *cache = page->slab_cache;
+			void *last_object;
+
+			object = virt_to_obj(cache, page_address(page), addr);
+			last_object = page_address(page) +
+				page->objects * cache->size;
+
+			if (unlikely(object > last_object))
+				object = last_object; /* we hit into padding */
+
+			object_err(cache, page, object,
+				"kasan: bad access detected");
+			return;
+		}
 		dump_page(page, "kasan: bad access detected");
 	}
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 429a4506b382..999bb3424d44 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -898,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	page = alloc_kmem_pages(flags, order);
 	ret = page ? page_address(page) : NULL;
 	kmemleak_alloc(ret, size, 1, flags);
+	kasan_kmalloc_large(ret, size);
 	return ret;
 }
 EXPORT_SYMBOL(kmalloc_order);
@@ -1077,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
 	if (p)
 		ks = ksize(p);
 
-	if (ks >= new_size)
+	if (ks >= new_size) {
+		kasan_krealloc((void *)p, new_size);
 		return (void *)p;
+	}
 
 	ret = kmalloc_track_caller(new_size, flags);
 	if (ret && p)
diff --git a/mm/slub.c b/mm/slub.c
index 37555ad8894d..6832c4eab104 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1251,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
 static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 {
 	kmemleak_alloc(ptr, size, 1, flags);
+	kasan_kmalloc_large(ptr, size);
 }
 
 static inline void kfree_hook(const void *x)
 {
 	kmemleak_free(x);
+	kasan_kfree_large(x);
 }
 
 static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
@@ -1278,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
 	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
 	memcg_kmem_put_cache(s);
+	kasan_slab_alloc(s, object);
 }
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1301,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
 #endif
 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(x, s->object_size);
+
+	kasan_slab_free(s, x);
 }
 
 /*
@@ -1395,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 				void *object)
 {
 	setup_object_debug(s, page, object);
-	if (unlikely(s->ctor))
+	if (unlikely(s->ctor)) {
+		kasan_unpoison_object_data(s, object);
 		s->ctor(object);
+		kasan_poison_object_data(s, object);
+	}
 }
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1429,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	if (unlikely(s->flags & SLAB_POISON))
 		memset(start, POISON_INUSE, PAGE_SIZE << order);
 
+	kasan_poison_slab(page);
+
 	for_each_object_idx(p, idx, s, start, page->objects) {
 		setup_object(s, page, p);
 		if (likely(idx < page->objects))
@@ -2522,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+	kasan_kmalloc(s, ret, size);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2548,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
+
+	kasan_kmalloc(s, ret, size);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2933,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node)
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 	init_tracking(kmem_cache_node, n);
 #endif
+	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
 	init_kmem_cache_node(n);
 	inc_slabs_node(kmem_cache_node, node, page->objects);
 
@@ -3305,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
+	kasan_kmalloc(s, ret, size);
+
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc);
@@ -3348,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
+	kasan_kmalloc(s, ret, size);
+
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
 
-size_t ksize(const void *object)
+static size_t __ksize(const void *object)
 {
 	struct page *page;
 
@@ -3369,6 +3387,15 @@ size_t ksize(const void *object)
 
 	return slab_ksize(page->slab_cache);
 }
+
+size_t ksize(const void *object)
+{
+	size_t size = __ksize(object);
+	/* We assume that ksize callers could use whole allocated area,
+	   so we need unpoison this area. */
+	kasan_krealloc(object, size);
+	return size;
+}
 EXPORT_SYMBOL(ksize);
 
 void kfree(const void *x)
-- 
cgit v1.2.3


From c420f167db8c799d69fe43a801c58a7f02e9d57c Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:59 -0800
Subject: kasan: enable stack instrumentation

Stack instrumentation allows to detect out of bounds memory accesses for
variables allocated on stack.  Compiler adds redzones around every
variable on stack and poisons redzones in function's prologue.

Such approach significantly increases stack usage, so all in-kernel stacks
size were doubled.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/page_64_types.h | 12 +++++++++---
 arch/x86/kernel/Makefile             |  2 ++
 arch/x86/mm/kasan_init_64.c          | 11 +++++++++--
 include/linux/init_task.h            |  8 ++++++++
 mm/kasan/kasan.h                     |  9 +++++++++
 mm/kasan/report.c                    |  6 ++++++
 scripts/Makefile.kasan               |  1 +
 7 files changed, 44 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 75450b2c7be4..4edd53b79a81 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,17 +1,23 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_SIZE_ORDER	2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
 
 #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
 #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
 
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
 
 #define DOUBLEFAULT_STACK 1
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b13b70634124..cdb1b70ddad0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -17,6 +17,8 @@ CFLAGS_REMOVE_early_printk.o = -pg
 endif
 
 KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
 
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 3e4d9a1a39fa..53508708b7aa 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -189,11 +189,18 @@ void __init kasan_init(void)
 		if (map_range(&pfn_mapped[i]))
 			panic("kasan: unable to allocate shadow!");
 	}
-
 	populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
-				(void *)KASAN_SHADOW_END);
+			kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+			(unsigned long)kasan_mem_to_shadow(_end),
+			NUMA_NO_NODE);
+
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR),
+			(void *)KASAN_SHADOW_END);
 
 	memset(kasan_zero_page, 0, PAGE_SIZE);
 
 	load_cr3(init_level4_pgt);
+	init_task.kasan_depth = 0;
 }
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d3d43ecf148c..696d22312b31 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -175,6 +175,13 @@ extern struct task_group root_task_group;
 # define INIT_NUMA_BALANCING(tsk)
 #endif
 
+#ifdef CONFIG_KASAN
+# define INIT_KASAN(tsk)						\
+	.kasan_depth = 1,
+#else
+# define INIT_KASAN(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -250,6 +257,7 @@ extern struct task_group root_task_group;
 	INIT_RT_MUTEXES(tsk)						\
 	INIT_VTIME(tsk)							\
 	INIT_NUMA_BALANCING(tsk)					\
+	INIT_KASAN(tsk)							\
 }
 
 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 5b052ab40cf9..1fcc1d81a9cf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -12,6 +12,15 @@
 #define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
 #define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
 
+/*
+ * Stack redzone shadow values
+ * (Those are compiler's ABI, don't change them)
+ */
+#define KASAN_STACK_LEFT        0xF1
+#define KASAN_STACK_MID         0xF2
+#define KASAN_STACK_RIGHT       0xF3
+#define KASAN_STACK_PARTIAL     0xF4
+
 
 struct kasan_access_info {
 	const void *access_addr;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 2760edb4d0a8..866732ef3db3 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -64,6 +64,12 @@ static void print_error_description(struct kasan_access_info *info)
 	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
 		bug_type = "out of bounds access";
 		break;
+	case KASAN_STACK_LEFT:
+	case KASAN_STACK_MID:
+	case KASAN_STACK_RIGHT:
+	case KASAN_STACK_PARTIAL:
+		bug_type = "out of bounds on stack";
+		break;
 	}
 
 	pr_err("BUG: KASan: %s in %pS at addr %p\n",
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 7acd6faa0335..2163b8cc446e 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -9,6 +9,7 @@ CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
 
 CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
 		-fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \
+		--param asan-stack=1 \
 		--param asan-instrumentation-with-call-threshold=$(call_threshold))
 
 ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
-- 
cgit v1.2.3


From 71394fe50146202f2c8d92cf50f5ebc761acf254 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:40:03 -0800
Subject: mm: vmalloc: add flag preventing guard hole allocation

For instrumenting global variables KASan will shadow memory backing memory
for modules.  So on module loading we will need to allocate memory for
shadow and map it at address in shadow that corresponds to the address
allocated in module_alloc().

__vmalloc_node_range() could be used for this purpose, except it puts a
guard hole after allocated area.  Guard hole in shadow memory should be a
problem because at some future point we might need to have a shadow memory
at address occupied by guard hole.  So we could fail to allocate shadow
for module_alloc().

Add a new vm_struct flag 'VM_NO_GUARD' indicating that vm area doesn't
have a guard hole.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h | 9 +++++++--
 mm/vmalloc.c            | 6 ++----
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index b87696fdf06a..1526fe712ca0 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -16,6 +16,7 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #define VM_USERMAP		0x00000008	/* suitable for remap_vmalloc_range */
 #define VM_VPAGES		0x00000010	/* buffer for pages was vmalloc'ed */
 #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
+#define VM_NO_GUARD		0x00000040      /* don't add guard page */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -96,8 +97,12 @@ void vmalloc_sync_all(void);
 
 static inline size_t get_vm_area_size(const struct vm_struct *area)
 {
-	/* return actual size without guard page */
-	return area->size - PAGE_SIZE;
+	if (!(area->flags & VM_NO_GUARD))
+		/* return actual size without guard page */
+		return area->size - PAGE_SIZE;
+	else
+		return area->size;
+
 }
 
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 39c338896416..2e74e99d4cfe 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (unlikely(!area))
 		return NULL;
 
-	/*
-	 * We always allocate a guard page.
-	 */
-	size += PAGE_SIZE;
+	if (!(flags & VM_NO_GUARD))
+		size += PAGE_SIZE;
 
 	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
 	if (IS_ERR(va)) {
-- 
cgit v1.2.3


From cb9e3c292d0115499c660028ad35ac5501d722b5 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:40:07 -0800
Subject: mm: vmalloc: pass additional vm_flags to __vmalloc_node_range()

For instrumenting global variables KASan will shadow memory backing memory
for modules.  So on module loading we will need to allocate memory for
shadow and map it at address in shadow that corresponds to the address
allocated in module_alloc().

__vmalloc_node_range() could be used for this purpose, except it puts a
guard hole after allocated area.  Guard hole in shadow memory should be a
problem because at some future point we might need to have a shadow memory
at address occupied by guard hole.  So we could fail to allocate shadow
for module_alloc().

Now we have VM_NO_GUARD flag disabling guard page, so we need to pass into
__vmalloc_node_range().  Add new parameter 'vm_flags' to
__vmalloc_node_range() function.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/module.c       |  2 +-
 arch/arm64/kernel/module.c     |  4 ++--
 arch/mips/kernel/module.c      |  2 +-
 arch/parisc/kernel/module.c    |  2 +-
 arch/s390/kernel/module.c      |  2 +-
 arch/sparc/kernel/module.c     |  2 +-
 arch/unicore32/kernel/module.c |  2 +-
 arch/x86/kernel/module.c       |  2 +-
 include/linux/vmalloc.h        |  4 +++-
 mm/vmalloc.c                   | 10 ++++++----
 10 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index bea7db9e5b80..2e11961f65ae 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -41,7 +41,7 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #endif
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 9b6f71db2709..67bf4107f6ef 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -35,8 +35,8 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
-				    __builtin_return_address(0));
+				    GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
+				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 enum aarch64_reloc_op {
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 2a52568dbcd6..1833f5171ccd 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -47,7 +47,7 @@ static DEFINE_SPINLOCK(dbe_lock);
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
-				GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #endif
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index 5822e8e200e6..3c63a820fcda 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -219,7 +219,7 @@ void *module_alloc(unsigned long size)
 	 * init_data correctly */
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
 				    GFP_KERNEL | __GFP_HIGHMEM,
-				    PAGE_KERNEL_RWX, NUMA_NO_NODE,
+				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 409d152585be..36154a2f1814 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -50,7 +50,7 @@ void *module_alloc(unsigned long size)
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE,
+				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 #endif
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index 97655e0fd243..192a617a32f3 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -29,7 +29,7 @@ static void *module_map(unsigned long size)
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #else
diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
index dc41f6dfedb6..e191b3448bd3 100644
--- a/arch/unicore32/kernel/module.c
+++ b/arch/unicore32/kernel/module.c
@@ -25,7 +25,7 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e69f9882bf95..e830e61aae05 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -88,7 +88,7 @@ void *module_alloc(unsigned long size)
 	return __vmalloc_node_range(size, 1,
 				    MODULES_VADDR + get_module_load_offset(),
 				    MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
-				    PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1526fe712ca0..7d7acb35603d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -76,7 +76,9 @@ extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
-			pgprot_t prot, int node, const void *caller);
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller);
+
 extern void vfree(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2e74e99d4cfe..35b25e1340ca 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1619,6 +1619,7 @@ fail:
  *	@end:		vm area range end
  *	@gfp_mask:	flags for the page level allocator
  *	@prot:		protection mask for the allocated pages
+ *	@vm_flags:	additional vm area flags (e.g. %VM_NO_GUARD)
  *	@node:		node to use for allocation or NUMA_NO_NODE
  *	@caller:	caller's return address
  *
@@ -1628,7 +1629,8 @@ fail:
  */
 void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
-			pgprot_t prot, int node, const void *caller)
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller)
 {
 	struct vm_struct *area;
 	void *addr;
@@ -1638,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 		goto fail;
 
-	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
-				  start, end, node, gfp_mask, caller);
+	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
+				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
@@ -1688,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
 			    int node, const void *caller)
 {
 	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
-				gfp_mask, prot, node, caller);
+				gfp_mask, prot, 0, node, caller);
 }
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
-- 
cgit v1.2.3


From 6301939d97d079f0d3dbe71e750f4daf5d39fc33 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:40:13 -0800
Subject: module: fix types of device tables aliases

MODULE_DEVICE_TABLE() macro used to create aliases to device tables.
Normally alias should have the same type as aliased symbol.

Device tables are arrays, so they have 'struct type##_device_id[x]'
types. Alias created by MODULE_DEVICE_TABLE() will have non-array type -
	'struct type##_device_id'.

This inconsistency confuses compiler, it could make a wrong assumption
about variable's size which leads KASan to produce a false positive report
about out of bounds access.

For every global variable compiler calls __asan_register_globals() passing
information about global variable (address, size, size with redzone, name
...) __asan_register_globals() poison symbols redzone to detect possible
out of bounds accesses.

When symbol has an alias __asan_register_globals() will be called as for
symbol so for alias.  Compiler determines size of variable by size of
variable's type.  Alias and symbol have the same address, so if alias have
the wrong size part of memory that actually belongs to the symbol could be
poisoned as redzone of alias symbol.

By fixing type of alias symbol we will fix size of it, so
__asan_register_globals() will not poison valid memory.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/module.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index b653d7c0a05a..42999fe2dbd0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -135,7 +135,7 @@ void trim_init_extable(struct module *m);
 #ifdef MODULE
 /* Creates an alias so file2alias.c can find device table. */
 #define MODULE_DEVICE_TABLE(type, name)					\
-  extern const struct type##_device_id __mod_##type##__##name##_device_table \
+extern const typeof(name) __mod_##type##__##name##_device_table		\
   __attribute__ ((unused, alias(__stringify(name))))
 #else  /* !MODULE */
 #define MODULE_DEVICE_TABLE(type, name)
-- 
cgit v1.2.3


From bebf56a1b176c2e1c9efe44e7e6915532cc682cf Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:40:17 -0800
Subject: kasan: enable instrumentation of global variables

This feature let us to detect accesses out of bounds of global variables.
This will work as for globals in kernel image, so for globals in modules.
Currently this won't work for symbols in user-specified sections (e.g.
__init, __read_mostly, ...)

The idea of this is simple.  Compiler increases each global variable by
redzone size and add constructors invoking __asan_register_globals()
function.  Information about global variable (address, size, size with
redzone ...) passed to __asan_register_globals() so we could poison
variable's redzone.

This patch also forces module_alloc() to return 8*PAGE_SIZE aligned
address making shadow memory handling (
kasan_module_alloc()/kasan_module_free() ) more simple.  Such alignment
guarantees that each shadow page backing modules address space correspond
to only one module_alloc() allocation.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kasan.txt       |  2 +-
 arch/x86/kernel/module.c      | 12 +++++++++-
 arch/x86/mm/kasan_init_64.c   |  2 +-
 include/linux/compiler-gcc4.h |  4 ++++
 include/linux/compiler-gcc5.h |  2 ++
 include/linux/kasan.h         | 10 +++++++++
 kernel/module.c               |  2 ++
 lib/Kconfig.kasan             |  1 +
 mm/kasan/kasan.c              | 52 +++++++++++++++++++++++++++++++++++++++++++
 mm/kasan/kasan.h              | 25 +++++++++++++++++++++
 mm/kasan/report.c             | 22 ++++++++++++++++++
 scripts/Makefile.kasan        |  2 +-
 12 files changed, 132 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
index f0645a8a992f..092fc10961fe 100644
--- a/Documentation/kasan.txt
+++ b/Documentation/kasan.txt
@@ -9,7 +9,7 @@ a fast and comprehensive solution for finding use-after-free and out-of-bounds
 bugs.
 
 KASan uses compile-time instrumentation for checking every memory access,
-therefore you will need a certain version of GCC >= 4.9.2
+therefore you will need a certain version of GCC > 4.9.2
 
 Currently KASan is supported only for x86_64 architecture and requires that the
 kernel be built with the SLUB allocator.
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e830e61aae05..d1ac80b72c72 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kasan.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -83,13 +84,22 @@ static unsigned long int get_module_load_offset(void)
 
 void *module_alloc(unsigned long size)
 {
+	void *p;
+
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
-	return __vmalloc_node_range(size, 1,
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN,
 				    MODULES_VADDR + get_module_load_offset(),
 				    MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
 				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 53508708b7aa..4860906c6b9f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -196,7 +196,7 @@ void __init kasan_init(void)
 			(unsigned long)kasan_mem_to_shadow(_end),
 			NUMA_NO_NODE);
 
-	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR),
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
 			(void *)KASAN_SHADOW_END);
 
 	memset(kasan_zero_page, 0, PAGE_SIZE);
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index d1a558239b1a..769e19864632 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -85,3 +85,7 @@
 #define __HAVE_BUILTIN_BSWAP16__
 #endif
 #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
+
+#if GCC_VERSION >= 40902
+#define KASAN_ABI_VERSION 3
+#endif
diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h
index c8c565952548..efee493714eb 100644
--- a/include/linux/compiler-gcc5.h
+++ b/include/linux/compiler-gcc5.h
@@ -63,3 +63,5 @@
 #define __HAVE_BUILTIN_BSWAP64__
 #define __HAVE_BUILTIN_BSWAP16__
 #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
+
+#define KASAN_ABI_VERSION 4
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d5310eef3e38..72ba725ddf9c 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -49,8 +49,15 @@ void kasan_krealloc(const void *object, size_t new_size);
 void kasan_slab_alloc(struct kmem_cache *s, void *object);
 void kasan_slab_free(struct kmem_cache *s, void *object);
 
+#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+
+int kasan_module_alloc(void *addr, size_t size);
+void kasan_module_free(void *addr);
+
 #else /* CONFIG_KASAN */
 
+#define MODULE_ALIGN 1
+
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
 
 static inline void kasan_enable_current(void) {}
@@ -74,6 +81,9 @@ static inline void kasan_krealloc(const void *object, size_t new_size) {}
 static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {}
 static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
 
+static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline void kasan_module_free(void *addr) {}
+
 #endif /* CONFIG_KASAN */
 
 #endif /* LINUX_KASAN_H */
diff --git a/kernel/module.c b/kernel/module.c
index 82dc1f899e6d..8426ad48362c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
+#include <linux/kasan.h>
 #include <linux/jump_label.h>
 #include <linux/pfn.h>
 #include <linux/bsearch.h>
@@ -1813,6 +1814,7 @@ static void unset_module_init_ro_nx(struct module *mod) { }
 void __weak module_memfree(void *module_region)
 {
 	vfree(module_region);
+	kasan_module_free(module_region);
 }
 
 void __weak module_arch_cleanup(struct module *mod)
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 4d47d874335c..4fecaedc80a2 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -6,6 +6,7 @@ if HAVE_ARCH_KASAN
 config KASAN
 	bool "KASan: runtime memory debugger"
 	depends on SLUB_DEBUG
+	select CONSTRUCTORS
 	help
 	  Enables kernel address sanitizer - runtime memory debugger,
 	  designed to find out-of-bounds accesses and use-after-free bugs.
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 799c52b9826c..78fee632a7ee 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -22,6 +22,7 @@
 #include <linux/memblock.h>
 #include <linux/memory.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/printk.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -395,6 +396,57 @@ void kasan_kfree_large(const void *ptr)
 			KASAN_FREE_PAGE);
 }
 
+int kasan_module_alloc(void *addr, size_t size)
+{
+	void *ret;
+	size_t shadow_size;
+	unsigned long shadow_start;
+
+	shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+	shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
+			PAGE_SIZE);
+
+	if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+		return -EINVAL;
+
+	ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+			shadow_start + shadow_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+			__builtin_return_address(0));
+	return ret ? 0 : -ENOMEM;
+}
+
+void kasan_module_free(void *addr)
+{
+	vfree(kasan_mem_to_shadow(addr));
+}
+
+static void register_global(struct kasan_global *global)
+{
+	size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+	kasan_unpoison_shadow(global->beg, global->size);
+
+	kasan_poison_shadow(global->beg + aligned_size,
+		global->size_with_redzone - aligned_size,
+		KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
 #define DEFINE_ASAN_LOAD_STORE(size)				\
 	void __asan_load##size(unsigned long addr)		\
 	{							\
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1fcc1d81a9cf..4986b0acab21 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -11,6 +11,7 @@
 #define KASAN_PAGE_REDZONE      0xFE  /* redzone for kmalloc_large allocations */
 #define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
 #define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_GLOBAL_REDZONE    0xFA  /* redzone for global variable */
 
 /*
  * Stack redzone shadow values
@@ -21,6 +22,10 @@
 #define KASAN_STACK_RIGHT       0xF3
 #define KASAN_STACK_PARTIAL     0xF4
 
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
 
 struct kasan_access_info {
 	const void *access_addr;
@@ -30,6 +35,26 @@ struct kasan_access_info {
 	unsigned long ip;
 };
 
+/* The layout of struct dictated by compiler */
+struct kasan_source_location {
+	const char *filename;
+	int line_no;
+	int column_no;
+};
+
+/* The layout of struct dictated by compiler */
+struct kasan_global {
+	const void *beg;		/* Address of the beginning of the global variable. */
+	size_t size;			/* Size of the global variable. */
+	size_t size_with_redzone;	/* Size of the variable + size of the red zone. 32 bytes aligned */
+	const void *name;
+	const void *module_name;	/* Name of the module where the global variable is declared. */
+	unsigned long has_dynamic_init;	/* This needed for C++ */
+#if KASAN_ABI_VERSION >= 4
+	struct kasan_source_location *location;
+#endif
+};
+
 void kasan_report_error(struct kasan_access_info *info);
 void kasan_report_user_access(struct kasan_access_info *info);
 
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 866732ef3db3..680ceedf810a 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -23,6 +23,8 @@
 #include <linux/types.h>
 #include <linux/kasan.h>
 
+#include <asm/sections.h>
+
 #include "kasan.h"
 #include "../slab.h"
 
@@ -61,6 +63,7 @@ static void print_error_description(struct kasan_access_info *info)
 		break;
 	case KASAN_PAGE_REDZONE:
 	case KASAN_KMALLOC_REDZONE:
+	case KASAN_GLOBAL_REDZONE:
 	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
 		bug_type = "out of bounds access";
 		break;
@@ -80,6 +83,20 @@ static void print_error_description(struct kasan_access_info *info)
 		info->access_size, current->comm, task_pid_nr(current));
 }
 
+static inline bool kernel_or_module_addr(const void *addr)
+{
+	return (addr >= (void *)_stext && addr < (void *)_end)
+		|| (addr >= (void *)MODULES_VADDR
+			&& addr < (void *)MODULES_END);
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+	return addr >= (void *)&init_thread_union.stack &&
+		(addr <= (void *)&init_thread_union.stack +
+			sizeof(init_thread_union.stack));
+}
+
 static void print_address_description(struct kasan_access_info *info)
 {
 	const void *addr = info->access_addr;
@@ -107,6 +124,11 @@ static void print_address_description(struct kasan_access_info *info)
 		dump_page(page, "kasan: bad access detected");
 	}
 
+	if (kernel_or_module_addr(addr)) {
+		if (!init_task_stack_addr(addr))
+			pr_err("Address belongs to variable %pS\n", addr);
+	}
+
 	dump_stack();
 }
 
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 2163b8cc446e..631619b2b118 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -9,7 +9,7 @@ CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
 
 CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
 		-fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \
-		--param asan-stack=1 \
+		--param asan-stack=1 --param asan-globals=1 \
 		--param asan-instrumentation-with-call-threshold=$(call_threshold))
 
 ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
-- 
cgit v1.2.3


From d347efeb16d3d5150cb7f8d50b05f388b572840e Mon Sep 17 00:00:00 2001
From: Adrien Schildknecht <adrien+dev@schischi.me>
Date: Thu, 12 Feb 2015 14:01:37 +0100
Subject: mutex: remove unused field "name" in debug mode

This field is unused and uninitialized since commit 9a11b49a8056
("[PATCH] lockdep: better lock debugging")

Signed-off-by: Adrien Schildknecht <adrien+dev@schischi.me>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mutex.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index cc31498fc526..2cb7531e7d7a 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -59,7 +59,6 @@ struct mutex {
 	struct optimistic_spin_queue osq; /* Spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
-	const char 		*name;
 	void			*magic;
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-- 
cgit v1.2.3


From 124cf9117c5f93cc5b324530b7e105b09c729d5d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 13 Feb 2015 23:50:43 +0100
Subject: PM / sleep: Make it possible to quiesce timers during suspend-to-idle

The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.

However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion.  A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.

The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping.  That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs.  It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.

Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit.  Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.

To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices.  Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpuidle/cpuidle.c | 49 +++++++++++++++++++++++++++++++++++++++++-----
 include/linux/cpuidle.h   |  9 +++++++++
 include/linux/tick.h      |  6 +++++-
 kernel/time/tick-common.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/time/timekeeping.c |  4 ++--
 kernel/time/timekeeping.h |  2 ++
 6 files changed, 112 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 23a8d6cc8d30..4d534582514e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -20,6 +20,7 @@
 #include <linux/hrtimer.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -69,18 +70,20 @@ int cpuidle_play_dead(void)
  * cpuidle_find_deepest_state - Find deepest state meeting specific conditions.
  * @drv: cpuidle driver for the given CPU.
  * @dev: cpuidle device for the given CPU.
+ * @freeze: Whether or not the state should be suitable for suspend-to-idle.
  */
 static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev)
+				      struct cpuidle_device *dev, bool freeze)
 {
 	unsigned int latency_req = 0;
-	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
+	int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1;
 
 	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
 		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
-		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+		if (s->disabled || su->disable || s->exit_latency <= latency_req
+		    || (freeze && !s->enter_freeze))
 			continue;
 
 		latency_req = s->exit_latency;
@@ -89,10 +92,31 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 	return ret;
 }
 
+static void enter_freeze_proper(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev, int index)
+{
+	tick_freeze();
+	/*
+	 * The state used here cannot be a "coupled" one, because the "coupled"
+	 * cpuidle mechanism enables interrupts and doing that with timekeeping
+	 * suspended is generally unsafe.
+	 */
+	drv->states[index].enter_freeze(dev, drv, index);
+	WARN_ON(!irqs_disabled());
+	/*
+	 * timekeeping_resume() that will be called by tick_unfreeze() for the
+	 * last CPU executing it calls functions containing RCU read-side
+	 * critical sections, so tell RCU about that.
+	 */
+	RCU_NONIDLE(tick_unfreeze());
+}
+
 /**
  * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle.
  *
- * Find the deepest state available and enter it.
+ * If there are states with the ->enter_freeze callback, find the deepest of
+ * them and enter it with frozen tick.  Otherwise, find the deepest state
+ * available and enter it normally.
  */
 void cpuidle_enter_freeze(void)
 {
@@ -100,7 +124,22 @@ void cpuidle_enter_freeze(void)
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
 	int index;
 
-	index = cpuidle_find_deepest_state(drv, dev);
+	/*
+	 * Find the deepest state with ->enter_freeze present, which guarantees
+	 * that interrupts won't be enabled when it exits and allows the tick to
+	 * be frozen safely.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, true);
+	if (index >= 0) {
+		enter_freeze_proper(drv, dev, index);
+		return;
+	}
+
+	/*
+	 * It is not safe to freeze the tick, find the deepest state available
+	 * at all and try to enter it normally.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, false);
 	if (index >= 0)
 		cpuidle_enter(drv, dev, index);
 	else
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index f63aabf4ee90..f551a9299ac9 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,15 @@ struct cpuidle_state {
 			int index);
 
 	int (*enter_dead) (struct cpuidle_device *dev, int index);
+
+	/*
+	 * CPUs execute ->enter_freeze with the local tick or entire timekeeping
+	 * suspended, so it must not re-enable interrupts at any point (even
+	 * temporarily) or attempt to change states of clock event devices.
+	 */
+	void (*enter_freeze) (struct cpuidle_device *dev,
+			      struct cpuidle_driver *drv,
+			      int index);
 };
 
 /* Idle State Flags */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index eda850ca757a..9c085dc12ae9 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -79,6 +79,9 @@ extern void __init tick_init(void);
 extern int tick_is_oneshot_available(void);
 extern struct tick_device *tick_get_device(int cpu);
 
+extern void tick_freeze(void);
+extern void tick_unfreeze(void);
+
 # ifdef CONFIG_HIGH_RES_TIMERS
 extern int tick_init_highres(void);
 extern int tick_program_event(ktime_t expires, int force);
@@ -119,6 +122,8 @@ static inline int tick_oneshot_mode_active(void) { return 0; }
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
+static inline void tick_freeze(void) { }
+static inline void tick_unfreeze(void) { }
 static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
@@ -226,5 +231,4 @@ static inline void tick_nohz_task_switch(struct task_struct *tsk)
 		__tick_nohz_task_switch(tsk);
 }
 
-
 #endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..f7c515595b42 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
 	}
 }
 
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping.  Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	tick_freeze_depth++;
+	if (tick_freeze_depth == num_online_cpus()) {
+		timekeeping_suspend();
+	} else {
+		tick_suspend();
+		tick_suspend_broadcast();
+	}
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping.  Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	if (tick_freeze_depth == num_online_cpus())
+		timekeeping_resume();
+	else
+		tick_resume();
+
+	tick_freeze_depth--;
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
 /**
  * tick_init - initialize the tick control
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index aef5dc722abf..91db94136c10 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1197,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
  * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1278,7 +1278,7 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..1d91416055d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
 extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
 
 #endif
-- 
cgit v1.2.3


From 2e4cdab0584fa884e0a81c4f45b93ce875c9fcaa Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:58:50 -0800
Subject: mm: allow page fault handlers to perform the COW

Currently COW of an XIP file is done by first bringing in a read-only
mapping, then retrying the fault and copying the page.  It is much more
efficient to tell the fault handler that a COW is being attempted (by
passing in the pre-allocated page in the vm_fault structure), and allow
the handler to perform the COW operation itself.

The handler cannot insert the page itself if there is already a read-only
mapping at that address, so allow the handler to return VM_FAULT_LOCKED
and set the fault_page to be NULL.  This indicates to the MM code that the
i_mmap_lock is held instead of the page lock.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  1 +
 mm/memory.c        | 41 ++++++++++++++++++++++++++++++++---------
 2 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9bee7ec0c31f..47a93928b90f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -224,6 +224,7 @@ struct vm_fault {
 	pgoff_t pgoff;			/* Logical page offset based on vma */
 	void __user *virtual_address;	/* Faulting virtual address */
 
+	struct page *cow_page;		/* Handler may choose to COW */
 	struct page *page;		/* ->fault handlers should return a
 					 * page here, unless VM_FAULT_NOPAGE
 					 * is set (which is also implied by
diff --git a/mm/memory.c b/mm/memory.c
index 1b04e13b9993..8068893697bb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 	vmf.pgoff = page->index;
 	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 	vmf.page = page;
+	vmf.cow_page = NULL;
 
 	ret = vma->vm_ops->page_mkwrite(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2639,7 +2640,8 @@ oom:
  * See filemap_fault() and __lock_page_retry().
  */
 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
-		pgoff_t pgoff, unsigned int flags, struct page **page)
+			pgoff_t pgoff, unsigned int flags,
+			struct page *cow_page, struct page **page)
 {
 	struct vm_fault vmf;
 	int ret;
@@ -2648,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.page = NULL;
+	vmf.cow_page = cow_page;
 
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
+	if (!vmf.page)
+		goto out;
 
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
@@ -2665,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	else
 		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
+ out:
 	*page = vmf.page;
 	return ret;
 }
@@ -2835,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
-	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
@@ -2875,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
-	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
 
-	copy_user_highpage(new_page, fault_page, address, vma);
+	if (fault_page)
+		copy_user_highpage(new_page, fault_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*pte, orig_pte))) {
 		pte_unmap_unlock(pte, ptl);
-		unlock_page(fault_page);
-		page_cache_release(fault_page);
+		if (fault_page) {
+			unlock_page(fault_page);
+			page_cache_release(fault_page);
+		} else {
+			/*
+			 * The fault handler has no page to lock, so it holds
+			 * i_mmap_lock for read to protect against truncate.
+			 */
+			i_mmap_unlock_read(vma->vm_file->f_mapping);
+		}
 		goto uncharge_out;
 	}
 	do_set_pte(vma, address, new_page, pte, true, true);
 	mem_cgroup_commit_charge(new_page, memcg, false);
 	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
-	unlock_page(fault_page);
-	page_cache_release(fault_page);
+	if (fault_page) {
+		unlock_page(fault_page);
+		page_cache_release(fault_page);
+	} else {
+		/*
+		 * The fault handler has no page to lock, so it holds
+		 * i_mmap_lock for read to protect against truncate.
+		 */
+		i_mmap_unlock_read(vma->vm_file->f_mapping);
+	}
 	return ret;
 uncharge_out:
 	mem_cgroup_cancel_charge(new_page, memcg);
@@ -2913,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	int dirtied = 0;
 	int ret, tmp;
 
-	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-- 
cgit v1.2.3


From fbbbad4bc2101e452b24e6e65d3d5e11314a0b5f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:58:53 -0800
Subject: vfs,ext2: introduce IS_DAX(inode)

Use an inode flag to tag inodes which should avoid using the page cache.
Convert ext2 to use it instead of mapping_is_xip().  Prevent I/Os to files
tagged with the DAX flag from falling back to buffered I/O.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/inode.c    |  9 ++++++---
 fs/ext2/xip.h      |  2 --
 include/linux/fs.h |  6 ++++++
 mm/filemap.c       | 19 ++++++++++++-------
 4 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..0cb04486577d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -731,7 +731,7 @@ static int ext2_get_blocks(struct inode *inode,
 		goto cleanup;
 	}
 
-	if (ext2_use_xip(inode->i_sb)) {
+	if (IS_DAX(inode)) {
 		/*
 		 * we need to clear the block
 		 */
@@ -1201,7 +1201,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
 	inode_dio_wait(inode);
 
-	if (mapping_is_xip(inode->i_mapping))
+	if (IS_DAX(inode))
 		error = xip_truncate_page(inode->i_mapping, newsize);
 	else if (test_opt(inode->i_sb, NOBH))
 		error = nobh_truncate_page(inode->i_mapping,
@@ -1273,7 +1273,8 @@ void ext2_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT2_I(inode)->i_flags;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+				S_DIRSYNC | S_DAX);
 	if (flags & EXT2_SYNC_FL)
 		inode->i_flags |= S_SYNC;
 	if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1285,8 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_NOATIME;
 	if (flags & EXT2_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
+	if (test_opt(inode->i_sb, XIP))
+		inode->i_flags |= S_DAX;
 }
 
 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
index 18b34d2f31b3..29be73781419 100644
--- a/fs/ext2/xip.h
+++ b/fs/ext2/xip.h
@@ -16,9 +16,7 @@ static inline int ext2_use_xip (struct super_block *sb)
 }
 int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
 				void **, unsigned long *);
-#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
 #else
-#define mapping_is_xip(map)			0
 #define ext2_xip_verify_sb(sb)			do { } while (0)
 #define ext2_use_xip(sb)			0
 #define ext2_clear_xip_target(inode, chain)	0
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e49f10cc8a73..fb373bb5cf03 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1677,6 +1677,11 @@ struct super_operations {
 #define S_IMA		1024	/* Inode has an associated IMA struct */
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
+#ifdef CONFIG_FS_XIP
+#define S_DAX		8192	/* Direct Access, avoiding the page cache */
+#else
+#define S_DAX		0	/* Make all the DAX code disappear */
+#endif
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -1714,6 +1719,7 @@ struct super_operations {
 #define IS_IMA(inode)		((inode)->i_flags & S_IMA)
 #define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)
+#define IS_DAX(inode)		((inode)->i_flags & S_DAX)
 
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
diff --git a/mm/filemap.c b/mm/filemap.c
index d9f5336552d7..1578c224285e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1723,9 +1723,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		 * we've already read everything we wanted to, or if
 		 * there was a short read because we hit EOF, go ahead
 		 * and return.  Otherwise fallthrough to buffered io for
-		 * the rest of the read.
+		 * the rest of the read.  Buffered reads will not work for
+		 * DAX files, so don't bother trying.
 		 */
-		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
+		    IS_DAX(inode)) {
 			file_accessed(file);
 			goto out;
 		}
@@ -2587,13 +2589,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		loff_t endbyte;
 
 		written = generic_file_direct_write(iocb, from, pos);
-		if (written < 0 || written == count)
-			goto out;
-
 		/*
-		 * direct-io write to a hole: fall through to buffered I/O
-		 * for completing the rest of the request.
+		 * If the write stopped short of completing, fall back to
+		 * buffered writes.  Some filesystems do this for writes to
+		 * holes, for example.  For DAX files, a buffered write will
+		 * not succeed (even if it did, DAX does not handle dirty
+		 * page-cache pages correctly).
 		 */
+		if (written < 0 || written == count || IS_DAX(inode))
+			goto out;
+
 		pos += written;
 		count -= written;
 
-- 
cgit v1.2.3


From d475c6346a38aef3058eba96867bfa726a3cc940 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:58:56 -0800
Subject: dax,ext2: replace XIP read and write with DAX I/O

Use the generic AIO infrastructure instead of custom read and write
methods.  In addition to giving us support for AIO, this adds the missing
locking between read() and truncate().

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS        |   6 ++
 fs/Makefile        |   1 +
 fs/dax.c           | 186 ++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/file.c     |   6 +-
 fs/ext2/inode.c    |   8 +-
 include/linux/fs.h |  12 ++-
 mm/filemap.c       |   6 +-
 mm/filemap_xip.c   | 234 -----------------------------------------------------
 8 files changed, 214 insertions(+), 245 deletions(-)
 create mode 100644 fs/dax.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 348f5c16ef50..8670c224c833 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3151,6 +3151,12 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-diolan-u2c.c
 
+DIRECT ACCESS (DAX)
+M:	Matthew Wilcox <willy@linux.intel.com>
+L:	linux-fsdevel@vger.kernel.org
+S:	Supported
+F:	fs/dax.c
+
 DIRECTORY NOTIFICATION (DNOTIFY)
 M:	Eric Paris <eparis@parisplace.org>
 S:	Maintained
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..0534444e257c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_FS_XIP)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..1a2bdbfa3ea9
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,186 @@
+/*
+ * fs/dax.c - Direct Access filesystem code
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/mutex.h>
+#include <linux/uio.h>
+
+static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+{
+	unsigned long pfn;
+	sector_t sector = bh->b_blocknr << (blkbits - 9);
+	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
+}
+
+static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
+			loff_t end)
+{
+	loff_t final = end - pos + first; /* The final byte of the buffer */
+
+	if (first > 0)
+		memset(addr, 0, first);
+	if (final < size)
+		memset(addr + final, 0, size - final);
+}
+
+static bool buffer_written(struct buffer_head *bh)
+{
+	return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+
+/*
+ * When ext4 encounters a hole, it returns without modifying the buffer_head
+ * which means that we can't trust b_size.  To cope with this, we set b_state
+ * to 0 before calling get_block and, if any bit is set, we know we can trust
+ * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
+ * and would save us time calling get_block repeatedly.
+ */
+static bool buffer_size_valid(struct buffer_head *bh)
+{
+	return bh->b_state != 0;
+}
+
+static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
+			loff_t start, loff_t end, get_block_t get_block,
+			struct buffer_head *bh)
+{
+	ssize_t retval = 0;
+	loff_t pos = start;
+	loff_t max = start;
+	loff_t bh_max = start;
+	void *addr;
+	bool hole = false;
+
+	if (rw != WRITE)
+		end = min(end, i_size_read(inode));
+
+	while (pos < end) {
+		unsigned len;
+		if (pos == max) {
+			unsigned blkbits = inode->i_blkbits;
+			sector_t block = pos >> blkbits;
+			unsigned first = pos - (block << blkbits);
+			long size;
+
+			if (pos == bh_max) {
+				bh->b_size = PAGE_ALIGN(end - pos);
+				bh->b_state = 0;
+				retval = get_block(inode, block, bh,
+								rw == WRITE);
+				if (retval)
+					break;
+				if (!buffer_size_valid(bh))
+					bh->b_size = 1 << blkbits;
+				bh_max = pos - first + bh->b_size;
+			} else {
+				unsigned done = bh->b_size -
+						(bh_max - (pos - first));
+				bh->b_blocknr += done >> blkbits;
+				bh->b_size -= done;
+			}
+
+			hole = (rw != WRITE) && !buffer_written(bh);
+			if (hole) {
+				addr = NULL;
+				size = bh->b_size - first;
+			} else {
+				retval = dax_get_addr(bh, &addr, blkbits);
+				if (retval < 0)
+					break;
+				if (buffer_unwritten(bh) || buffer_new(bh))
+					dax_new_buf(addr, retval, first, pos,
+									end);
+				addr += first;
+				size = retval - first;
+			}
+			max = min(pos + size, end);
+		}
+
+		if (rw == WRITE)
+			len = copy_from_iter(addr, max - pos, iter);
+		else if (!hole)
+			len = copy_to_iter(addr, max - pos, iter);
+		else
+			len = iov_iter_zero(max - pos, iter);
+
+		if (!len)
+			break;
+
+		pos += len;
+		addr += len;
+	}
+
+	return (pos == start) ? retval : pos - start;
+}
+
+/**
+ * dax_do_io - Perform I/O to a DAX file
+ * @rw: READ to read or WRITE to write
+ * @iocb: The control block for this I/O
+ * @inode: The file which the I/O is directed at
+ * @iter: The addresses to do I/O from or to
+ * @pos: The file offset where the I/O starts
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @end_io: A filesystem callback for I/O completion
+ * @flags: See below
+ *
+ * This function uses the same locking scheme as do_blockdev_direct_IO:
+ * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+ * caller for writes.  For reads, we take and release the i_mutex ourselves.
+ * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+ * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+ * is in progress.
+ */
+ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
+			struct iov_iter *iter, loff_t pos,
+			get_block_t get_block, dio_iodone_t end_io, int flags)
+{
+	struct buffer_head bh;
+	ssize_t retval = -EINVAL;
+	loff_t end = pos + iov_iter_count(iter);
+
+	memset(&bh, 0, sizeof(bh));
+
+	if ((flags & DIO_LOCKING) && (rw == READ)) {
+		struct address_space *mapping = inode->i_mapping;
+		mutex_lock(&inode->i_mutex);
+		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
+		if (retval) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
+	/* Protects against truncate */
+	atomic_inc(&inode->i_dio_count);
+
+	retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
+
+	if ((flags & DIO_LOCKING) && (rw == READ))
+		mutex_unlock(&inode->i_mutex);
+
+	if ((retval > 0) && end_io)
+		end_io(iocb, pos, retval, bh.b_private);
+
+	inode_dio_done(inode);
+ out:
+	return retval;
+}
+EXPORT_SYMBOL_GPL(dax_do_io);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..a247123fd798 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_EXT2_FS_XIP
 const struct file_operations ext2_xip_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= xip_file_read,
-	.write		= xip_file_write,
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0cb04486577d..3ccd5fd47d66 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+	if (IS_DAX(inode))
+		ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+				NULL, DIO_LOCKING);
+	else
+		ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+					 ext2_get_block);
 	if (ret < 0 && (rw & WRITE))
 		ext2_write_failed(mapping, offset + count);
 	return ret;
@@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = {
 const struct address_space_operations ext2_aops_xip = {
 	.bmap			= ext2_bmap,
 	.get_xip_mem		= ext2_get_xip_mem,
+	.direct_IO		= ext2_direct_IO,
 };
 
 const struct address_space_operations ext2_nobh_aops = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb373bb5cf03..241c3c030fb5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
+ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
+		loff_t, get_block_t, dio_iodone_t, int flags);
+
 #ifdef CONFIG_FS_XIP
-extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
-			     loff_t *ppos);
 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
-extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
-			      size_t len, loff_t *ppos);
 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
 #else
 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
@@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
 
+static inline bool io_is_direct(struct file *filp)
+{
+	return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
+}
+
 static inline ino_t parent_ino(struct dentry *dentry)
 {
 	ino_t res;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1578c224285e..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t *ppos = &iocb->ki_pos;
 	loff_t pos = *ppos;
 
-	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-	if (file->f_flags & O_DIRECT) {
+	if (io_is_direct(file)) {
 		struct address_space *mapping = file->f_mapping;
 		struct inode *inode = mapping->host;
 		size_t count = iov_iter_count(iter);
@@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err)
 		goto out;
 
-	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-	if (unlikely(file->f_flags & O_DIRECT)) {
+	if (io_is_direct(file)) {
 		loff_t endbyte;
 
 		written = generic_file_direct_write(iocb, from, pos);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 59e1c5585748..9c869f402c07 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -42,119 +42,6 @@ static struct page *xip_sparse_page(void)
 	return __xip_sparse_page;
 }
 
-/*
- * This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_mem() function for the actual low-level
- * stuff.
- *
- * Note the struct file* is not used at all.  It may be NULL.
- */
-static ssize_t
-do_xip_mapping_read(struct address_space *mapping,
-		    struct file_ra_state *_ra,
-		    struct file *filp,
-		    char __user *buf,
-		    size_t len,
-		    loff_t *ppos)
-{
-	struct inode *inode = mapping->host;
-	pgoff_t index, end_index;
-	unsigned long offset;
-	loff_t isize, pos;
-	size_t copied = 0, error = 0;
-
-	BUG_ON(!mapping->a_ops->get_xip_mem);
-
-	pos = *ppos;
-	index = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & ~PAGE_CACHE_MASK;
-
-	isize = i_size_read(inode);
-	if (!isize)
-		goto out;
-
-	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-	do {
-		unsigned long nr, left;
-		void *xip_mem;
-		unsigned long xip_pfn;
-		int zero = 0;
-
-		/* nr is the maximum number of bytes to copy from this page */
-		nr = PAGE_CACHE_SIZE;
-		if (index >= end_index) {
-			if (index > end_index)
-				goto out;
-			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
-			if (nr <= offset) {
-				goto out;
-			}
-		}
-		nr = nr - offset;
-		if (nr > len - copied)
-			nr = len - copied;
-
-		error = mapping->a_ops->get_xip_mem(mapping, index, 0,
-							&xip_mem, &xip_pfn);
-		if (unlikely(error)) {
-			if (error == -ENODATA) {
-				/* sparse */
-				zero = 1;
-			} else
-				goto out;
-		}
-
-		/* If users can be writing to this page using arbitrary
-		 * virtual addresses, take care about potential aliasing
-		 * before reading the page on the kernel side.
-		 */
-		if (mapping_writably_mapped(mapping))
-			/* address based flush */ ;
-
-		/*
-		 * Ok, we have the mem, so now we can copy it to user space...
-		 *
-		 * The actor routine returns how many bytes were actually used..
-		 * NOTE! This may not be the same as how much of a user buffer
-		 * we filled up (we may be padding etc), so we can only update
-		 * "pos" here (the actor routine has to update the user buffer
-		 * pointers and the remaining count).
-		 */
-		if (!zero)
-			left = __copy_to_user(buf+copied, xip_mem+offset, nr);
-		else
-			left = __clear_user(buf + copied, nr);
-
-		if (left) {
-			error = -EFAULT;
-			goto out;
-		}
-
-		copied += (nr - left);
-		offset += (nr - left);
-		index += offset >> PAGE_CACHE_SHIFT;
-		offset &= ~PAGE_CACHE_MASK;
-	} while (copied < len);
-
-out:
-	*ppos = pos + copied;
-	if (filp)
-		file_accessed(filp);
-
-	return (copied ? copied : error);
-}
-
-ssize_t
-xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
-	if (!access_ok(VERIFY_WRITE, buf, len))
-		return -EFAULT;
-
-	return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
-			    buf, len, ppos);
-}
-EXPORT_SYMBOL_GPL(xip_file_read);
-
 /*
  * __xip_unmap is invoked from xip_unmap and xip_write
  *
@@ -341,127 +228,6 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
 
-static ssize_t
-__xip_file_write(struct file *filp, const char __user *buf,
-		  size_t count, loff_t pos, loff_t *ppos)
-{
-	struct address_space * mapping = filp->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode 	*inode = mapping->host;
-	long		status = 0;
-	size_t		bytes;
-	ssize_t		written = 0;
-
-	BUG_ON(!mapping->a_ops->get_xip_mem);
-
-	do {
-		unsigned long index;
-		unsigned long offset;
-		size_t copied;
-		void *xip_mem;
-		unsigned long xip_pfn;
-
-		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
-
-		status = a_ops->get_xip_mem(mapping, index, 0,
-						&xip_mem, &xip_pfn);
-		if (status == -ENODATA) {
-			/* we allocate a new page unmap it */
-			mutex_lock(&xip_sparse_mutex);
-			status = a_ops->get_xip_mem(mapping, index, 1,
-							&xip_mem, &xip_pfn);
-			mutex_unlock(&xip_sparse_mutex);
-			if (!status)
-				/* unmap page at pgoff from all other vmas */
-				__xip_unmap(mapping, index);
-		}
-
-		if (status)
-			break;
-
-		copied = bytes -
-			__copy_from_user_nocache(xip_mem + offset, buf, bytes);
-
-		if (likely(copied > 0)) {
-			status = copied;
-
-			if (status >= 0) {
-				written += status;
-				count -= status;
-				pos += status;
-				buf += status;
-			}
-		}
-		if (unlikely(copied != bytes))
-			if (status >= 0)
-				status = -EFAULT;
-		if (status < 0)
-			break;
-	} while (count);
-	*ppos = pos;
-	/*
-	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold i_mutex.
-	 */
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-
-	return written ? written : status;
-}
-
-ssize_t
-xip_file_write(struct file *filp, const char __user *buf, size_t len,
-	       loff_t *ppos)
-{
-	struct address_space *mapping = filp->f_mapping;
-	struct inode *inode = mapping->host;
-	size_t count;
-	loff_t pos;
-	ssize_t ret;
-
-	mutex_lock(&inode->i_mutex);
-
-	if (!access_ok(VERIFY_READ, buf, len)) {
-		ret=-EFAULT;
-		goto out_up;
-	}
-
-	pos = *ppos;
-	count = len;
-
-	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = inode_to_bdi(inode);
-
-	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
-	if (ret)
-		goto out_backing;
-	if (count == 0)
-		goto out_backing;
-
-	ret = file_remove_suid(filp);
-	if (ret)
-		goto out_backing;
-
-	ret = file_update_time(filp);
-	if (ret)
-		goto out_backing;
-
-	ret = __xip_file_write (filp, buf, count, pos, ppos);
-
- out_backing:
-	current->backing_dev_info = NULL;
- out_up:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_write);
-
 /*
  * truncate a page used for execute in place
  * functionality is analog to block_truncate_page but does use get_xip_mem
-- 
cgit v1.2.3


From 289c6aedac981533331428bc933fff21ae332c9e Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:58:59 -0800
Subject: dax,ext2: replace ext2_clear_xip_target with dax_clear_blocks

This is practically generic code; other filesystems will want to call it
from other places, but there's nothing ext2-specific about it.

Make it a little more generic by allowing it to take a count of the number
of bytes to zero rather than fixing it to a single page.  Thanks to Dave
Hansen for suggesting that I need to call cond_resched() if zeroing more
than one page.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c           | 37 +++++++++++++++++++++++++++++++++++++
 fs/ext2/inode.c    |  8 +++++---
 fs/ext2/xip.c      | 14 --------------
 fs/ext2/xip.h      |  3 ---
 include/linux/fs.h |  1 +
 5 files changed, 43 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 1a2bdbfa3ea9..69c3126a05b4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -20,8 +20,45 @@
 #include <linux/fs.h>
 #include <linux/genhd.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
 #include <linux/uio.h>
 
+int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	sector_t sector = block << (inode->i_blkbits - 9);
+
+	might_sleep();
+	do {
+		void *addr;
+		unsigned long pfn;
+		long count;
+
+		count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+		if (count < 0)
+			return count;
+		BUG_ON(size < count);
+		while (count > 0) {
+			unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
+			if (pgsz > count)
+				pgsz = count;
+			if (pgsz < PAGE_SIZE)
+				memset(addr, 0, pgsz);
+			else
+				clear_page(addr);
+			addr += pgsz;
+			size -= pgsz;
+			count -= pgsz;
+			BUG_ON(pgsz & 511);
+			sector += pgsz / 512;
+			cond_resched();
+		}
+	} while (size);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_clear_blocks);
+
 static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
 {
 	unsigned long pfn;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3ccd5fd47d66..52978b853226 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -733,10 +733,12 @@ static int ext2_get_blocks(struct inode *inode,
 
 	if (IS_DAX(inode)) {
 		/*
-		 * we need to clear the block
+		 * block must be initialised before we put it in the tree
+		 * so that it's not found by another thread before it's
+		 * initialised
 		 */
-		err = ext2_clear_xip_target (inode,
-			le32_to_cpu(chain[depth-1].key));
+		err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+						1 << inode->i_blkbits);
 		if (err) {
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index bbc5fec6ff7f..8cfca3a4cd58 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -42,20 +42,6 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
 	return rc;
 }
 
-int
-ext2_clear_xip_target(struct inode *inode, sector_t block)
-{
-	void *kaddr;
-	unsigned long pfn;
-	long size;
-
-	size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE);
-	if (size < 0)
-		return size;
-	clear_page(kaddr);
-	return 0;
-}
-
 void ext2_xip_verify_sb(struct super_block *sb)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
index 29be73781419..b2592f2f3c9d 100644
--- a/fs/ext2/xip.h
+++ b/fs/ext2/xip.h
@@ -7,8 +7,6 @@
 
 #ifdef CONFIG_EXT2_FS_XIP
 extern void ext2_xip_verify_sb (struct super_block *);
-extern int ext2_clear_xip_target (struct inode *, sector_t);
-
 static inline int ext2_use_xip (struct super_block *sb)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
@@ -19,6 +17,5 @@ int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
 #else
 #define ext2_xip_verify_sb(sb)			do { } while (0)
 #define ext2_use_xip(sb)			0
-#define ext2_clear_xip_target(inode, chain)	0
 #define ext2_get_xip_mem			NULL
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 241c3c030fb5..8084934a5676 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2589,6 +2589,7 @@ extern int nonseekable_open(struct inode * inode, struct file * filp);
 
 ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
 		loff_t, get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
 
 #ifdef CONFIG_FS_XIP
 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
-- 
cgit v1.2.3


From f7ca90b160307d63aaedab8bd451c24a182db20f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:59:02 -0800
Subject: dax,ext2: replace the XIP page fault handler with the DAX page fault
 handler

Instead of calling aops->get_xip_mem from the fault handler, the
filesystem passes a get_block_t that is used to find the appropriate
blocks.

This requires that all architectures implement copy_user_page().  At the
time of writing, mips and arm do not.  Patches exist and are in progress.

[akpm@linux-foundation.org: remap_file_pages went away]
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c           | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/file.c     |  34 +++++++-
 include/linux/fs.h |   4 +-
 mm/filemap_xip.c   | 206 ---------------------------------------------
 4 files changed, 276 insertions(+), 209 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 69c3126a05b4..553e55b93495 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -19,9 +19,13 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
+#include <linux/vmstat.h>
 
 int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 {
@@ -221,3 +225,240 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 	return retval;
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
+
+/*
+ * The user has performed a load from a hole in the file.  Allocating
+ * a new page in the file would cause excessive storage usage for
+ * workloads with sparse files.  We allocate a page cache page instead.
+ * We'll kick it out of the page cache if it's ever written to,
+ * otherwise it will simply fall out of the page cache under memory
+ * pressure without ever having been dirtied.
+ */
+static int dax_load_hole(struct address_space *mapping, struct page *page,
+							struct vm_fault *vmf)
+{
+	unsigned long size;
+	struct inode *inode = mapping->host;
+	if (!page)
+		page = find_or_create_page(mapping, vmf->pgoff,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return VM_FAULT_OOM;
+	/* Recheck i_size under page lock to avoid truncate race */
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size) {
+		unlock_page(page);
+		page_cache_release(page);
+		return VM_FAULT_SIGBUS;
+	}
+
+	vmf->page = page;
+	return VM_FAULT_LOCKED;
+}
+
+static int copy_user_bh(struct page *to, struct buffer_head *bh,
+			unsigned blkbits, unsigned long vaddr)
+{
+	void *vfrom, *vto;
+	if (dax_get_addr(bh, &vfrom, blkbits) < 0)
+		return -EIO;
+	vto = kmap_atomic(to);
+	copy_user_page(vto, vfrom, vaddr, to);
+	kunmap_atomic(vto);
+	return 0;
+}
+
+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+			struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct address_space *mapping = inode->i_mapping;
+	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	void *addr;
+	unsigned long pfn;
+	pgoff_t size;
+	int error;
+
+	i_mmap_lock_read(mapping);
+
+	/*
+	 * Check truncate didn't happen while we were allocating a block.
+	 * If it did, this block may or may not be still allocated to the
+	 * file.  We can't tell the filesystem to free it because we can't
+	 * take i_mutex here.  In the worst case, the file still has blocks
+	 * allocated past the end of the file.
+	 */
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (unlikely(vmf->pgoff >= size)) {
+		error = -EIO;
+		goto out;
+	}
+
+	error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
+	if (error < 0)
+		goto out;
+	if (error < PAGE_SIZE) {
+		error = -EIO;
+		goto out;
+	}
+
+	if (buffer_unwritten(bh) || buffer_new(bh))
+		clear_page(addr);
+
+	error = vm_insert_mixed(vma, vaddr, pfn);
+
+ out:
+	i_mmap_unlock_read(mapping);
+
+	if (bh->b_end_io)
+		bh->b_end_io(bh, 1);
+
+	return error;
+}
+
+static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct page *page;
+	struct buffer_head bh;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned blkbits = inode->i_blkbits;
+	sector_t block;
+	pgoff_t size;
+	int error;
+	int major = 0;
+
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		return VM_FAULT_SIGBUS;
+
+	memset(&bh, 0, sizeof(bh));
+	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+	bh.b_size = PAGE_SIZE;
+
+ repeat:
+	page = find_get_page(mapping, vmf->pgoff);
+	if (page) {
+		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+			page_cache_release(page);
+			return VM_FAULT_RETRY;
+		}
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (unlikely(vmf->pgoff >= size)) {
+			/*
+			 * We have a struct page covering a hole in the file
+			 * from a read fault and we've raced with a truncate
+			 */
+			error = -EIO;
+			goto unlock_page;
+		}
+	}
+
+	error = get_block(inode, block, &bh, 0);
+	if (!error && (bh.b_size < PAGE_SIZE))
+		error = -EIO;		/* fs corruption? */
+	if (error)
+		goto unlock_page;
+
+	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+		if (vmf->flags & FAULT_FLAG_WRITE) {
+			error = get_block(inode, block, &bh, 1);
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			if (!error && (bh.b_size < PAGE_SIZE))
+				error = -EIO;
+			if (error)
+				goto unlock_page;
+		} else {
+			return dax_load_hole(mapping, page, vmf);
+		}
+	}
+
+	if (vmf->cow_page) {
+		struct page *new_page = vmf->cow_page;
+		if (buffer_written(&bh))
+			error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+		else
+			clear_user_highpage(new_page, vaddr);
+		if (error)
+			goto unlock_page;
+		vmf->page = page;
+		if (!page) {
+			i_mmap_lock_read(mapping);
+			/* Check we didn't race with truncate */
+			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
+								PAGE_SHIFT;
+			if (vmf->pgoff >= size) {
+				i_mmap_unlock_read(mapping);
+				error = -EIO;
+				goto out;
+			}
+		}
+		return VM_FAULT_LOCKED;
+	}
+
+	/* Check we didn't race with a read fault installing a new page */
+	if (!page && major)
+		page = find_lock_page(mapping, vmf->pgoff);
+
+	if (page) {
+		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+							PAGE_CACHE_SIZE, 0);
+		delete_from_page_cache(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	error = dax_insert_mapping(inode, &bh, vma, vmf);
+
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if ((error < 0) && (error != -EBUSY))
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+
+ unlock_page:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	goto out;
+}
+
+/**
+ * dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files.
+ */
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block)
+{
+	int result;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(sb);
+		file_update_time(vma->vm_file);
+	}
+	result = do_dax_fault(vma, vmf, get_block);
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(sb);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(dax_fault);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a247123fd798..a61c93fd9dce 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
 #include "xattr.h"
 #include "acl.h"
 
+#ifdef CONFIG_EXT2_FS_XIP
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+	.fault		= ext2_dax_fault,
+	.page_mkwrite	= ext2_dax_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_mmap(file, vma);
+
+	file_accessed(file);
+	vma->vm_ops = &ext2_dax_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
+#else
+#define ext2_file_mmap	generic_file_mmap
+#endif
+
 /*
  * Called when filp is released. This happens when all file descriptors
  * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap		= generic_file_mmap,
+	.mmap		= ext2_file_mmap,
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
@@ -89,7 +119,7 @@ const struct file_operations ext2_xip_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap		= xip_file_mmap,
+	.mmap		= ext2_file_mmap,
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8084934a5676..6bad6d4c579b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -51,6 +51,7 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
+struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2590,9 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp);
 ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
 		loff_t, get_block_t, dio_iodone_t, int flags);
 int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
 #ifdef CONFIG_FS_XIP
-extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
 #else
 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9c869f402c07..59fb387b2238 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -22,212 +22,6 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-/*
- * We do use our own empty page to avoid interference with other users
- * of ZERO_PAGE(), such as /dev/zero
- */
-static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
-static struct page *__xip_sparse_page;
-
-/* called under xip_sparse_mutex */
-static struct page *xip_sparse_page(void)
-{
-	if (!__xip_sparse_page) {
-		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
-
-		if (page)
-			__xip_sparse_page = page;
-	}
-	return __xip_sparse_page;
-}
-
-/*
- * __xip_unmap is invoked from xip_unmap and xip_write
- *
- * This function walks all vmas of the address_space and unmaps the
- * __xip_sparse_page when found at pgoff.
- */
-static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
-{
-	struct vm_area_struct *vma;
-	struct page *page;
-	unsigned count;
-	int locked = 0;
-
-	count = read_seqcount_begin(&xip_sparse_seq);
-
-	page = __xip_sparse_page;
-	if (!page)
-		return;
-
-retry:
-	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		pte_t *pte, pteval;
-		spinlock_t *ptl;
-		struct mm_struct *mm = vma->vm_mm;
-		unsigned long address = vma->vm_start +
-			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-
-		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		pte = page_check_address(page, mm, address, &ptl, 1);
-		if (pte) {
-			/* Nuke the page table entry. */
-			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush(vma, address, pte);
-			page_remove_rmap(page);
-			dec_mm_counter(mm, MM_FILEPAGES);
-			BUG_ON(pte_dirty(pteval));
-			pte_unmap_unlock(pte, ptl);
-			/* must invalidate_page _before_ freeing the page */
-			mmu_notifier_invalidate_page(mm, address);
-			page_cache_release(page);
-		}
-	}
-	i_mmap_unlock_read(mapping);
-
-	if (locked) {
-		mutex_unlock(&xip_sparse_mutex);
-	} else if (read_seqcount_retry(&xip_sparse_seq, count)) {
-		mutex_lock(&xip_sparse_mutex);
-		locked = 1;
-		goto retry;
-	}
-}
-
-/*
- * xip_fault() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * This function is derived from filemap_fault, but used for execute in place
- */
-static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	pgoff_t size;
-	void *xip_mem;
-	unsigned long xip_pfn;
-	struct page *page;
-	int error;
-
-	/* XXX: are VM_FAULT_ codes OK? */
-again:
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (vmf->pgoff >= size)
-		return VM_FAULT_SIGBUS;
-
-	error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
-						&xip_mem, &xip_pfn);
-	if (likely(!error))
-		goto found;
-	if (error != -ENODATA)
-		return VM_FAULT_OOM;
-
-	/* sparse block */
-	if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
-	    (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
-	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
-		int err;
-
-		/* maybe shared writable, allocate new block */
-		mutex_lock(&xip_sparse_mutex);
-		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
-							&xip_mem, &xip_pfn);
-		mutex_unlock(&xip_sparse_mutex);
-		if (error)
-			return VM_FAULT_SIGBUS;
-		/* unmap sparse mappings at pgoff from all other vmas */
-		__xip_unmap(mapping, vmf->pgoff);
-
-found:
-		/*
-		 * We must recheck i_size under i_mmap_rwsem to prevent races
-		 * with truncation
-		 */
-		i_mmap_lock_read(mapping);
-		size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-							PAGE_CACHE_SHIFT;
-		if (unlikely(vmf->pgoff >= size)) {
-			i_mmap_unlock_read(mapping);
-			return VM_FAULT_SIGBUS;
-		}
-		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
-							xip_pfn);
-		i_mmap_unlock_read(mapping);
-		if (err == -ENOMEM)
-			return VM_FAULT_OOM;
-		/*
-		 * err == -EBUSY is fine, we've raced against another thread
-		 * that faulted-in the same page
-		 */
-		if (err != -EBUSY)
-			BUG_ON(err);
-		return VM_FAULT_NOPAGE;
-	} else {
-		int err, ret = VM_FAULT_OOM;
-
-		mutex_lock(&xip_sparse_mutex);
-		write_seqcount_begin(&xip_sparse_seq);
-		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
-							&xip_mem, &xip_pfn);
-		if (unlikely(!error)) {
-			write_seqcount_end(&xip_sparse_seq);
-			mutex_unlock(&xip_sparse_mutex);
-			goto again;
-		}
-		if (error != -ENODATA)
-			goto out;
-
-		/*
-		 * We must recheck i_size under i_mmap_rwsem to prevent races
-		 * with truncation
-		 */
-		i_mmap_lock_read(mapping);
-		size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-							PAGE_CACHE_SHIFT;
-		if (unlikely(vmf->pgoff >= size)) {
-			ret = VM_FAULT_SIGBUS;
-			goto unlock;
-		}
-		/* not shared and writable, use xip_sparse_page() */
-		page = xip_sparse_page();
-		if (!page)
-			goto unlock;
-		err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
-							page);
-		if (err == -ENOMEM)
-			goto unlock;
-
-		ret = VM_FAULT_NOPAGE;
-unlock:
-		i_mmap_unlock_read(mapping);
-out:
-		write_seqcount_end(&xip_sparse_seq);
-		mutex_unlock(&xip_sparse_mutex);
-
-		return ret;
-	}
-}
-
-static const struct vm_operations_struct xip_file_vm_ops = {
-	.fault	= xip_file_fault,
-	.page_mkwrite	= filemap_page_mkwrite,
-};
-
-int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
-
-	file_accessed(file);
-	vma->vm_ops = &xip_file_vm_ops;
-	vma->vm_flags |= VM_MIXEDMAP;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(xip_file_mmap);
-
 /*
  * truncate a page used for execute in place
  * functionality is analog to block_truncate_page but does use get_xip_mem
-- 
cgit v1.2.3


From 4c0ccfef2e9f7418a6eb0bf07a2fc8f216365b18 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:59:06 -0800
Subject: dax,ext2: replace xip_truncate_page with dax_truncate_page

It takes a get_block parameter just like nobh_truncate_page() and
block_truncate_page()

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c           | 44 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/inode.c    |  2 +-
 include/linux/fs.h | 10 +---------
 mm/filemap_xip.c   | 40 ----------------------------------------
 4 files changed, 46 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 553e55b93495..ebf9b2231b0f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -462,3 +462,47 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_fault);
+
+/**
+ * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * Similar to block_truncate_page(), this function can be called by a
+ * filesystem when it is truncating an DAX file to handle the partial page.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks.  Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmaped.
+ */
+int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+{
+	struct buffer_head bh;
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length = PAGE_CACHE_ALIGN(from) - from;
+	int err;
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+
+	memset(&bh, 0, sizeof(bh));
+	bh.b_size = PAGE_CACHE_SIZE;
+	err = get_block(inode, index, &bh, 0);
+	if (err < 0)
+		return err;
+	if (buffer_written(&bh)) {
+		void *addr;
+		err = dax_get_addr(&bh, &addr, inode->i_blkbits);
+		if (err < 0)
+			return err;
+		memset(addr + offset, 0, length);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 52978b853226..5ac0a3461b3c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1210,7 +1210,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	inode_dio_wait(inode);
 
 	if (IS_DAX(inode))
-		error = xip_truncate_page(inode->i_mapping, newsize);
+		error = dax_truncate_page(inode, newsize, ext2_get_block);
 	else if (test_opt(inode->i_sb, NOBH))
 		error = nobh_truncate_page(inode->i_mapping,
 				newsize, ext2_get_block);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6bad6d4c579b..2c8f9055af38 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2591,18 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp);
 ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
 		loff_t, get_block_t, dio_iodone_t, int flags);
 int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
-#ifdef CONFIG_FS_XIP
-extern int xip_truncate_page(struct address_space *mapping, loff_t from);
-#else
-static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
-{
-	return 0;
-}
-#endif
-
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
 			    loff_t file_offset);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 59fb387b2238..8e3f99b61959 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -22,43 +22,3 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-/*
- * truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_mem
- * to get the page instead of page cache
- */
-int
-xip_truncate_page(struct address_space *mapping, loff_t from)
-{
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned blocksize;
-	unsigned length;
-	void *xip_mem;
-	unsigned long xip_pfn;
-	int err;
-
-	BUG_ON(!mapping->a_ops->get_xip_mem);
-
-	blocksize = 1 << mapping->host->i_blkbits;
-	length = offset & (blocksize - 1);
-
-	/* Block boundary? Nothing to do */
-	if (!length)
-		return 0;
-
-	length = blocksize - length;
-
-	err = mapping->a_ops->get_xip_mem(mapping, index, 0,
-						&xip_mem, &xip_pfn);
-	if (unlikely(err)) {
-		if (err == -ENODATA)
-			/* Hole? No need to truncate */
-			return 0;
-		else
-			return err;
-	}
-	memset(xip_mem + offset, 0, length);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(xip_truncate_page);
-- 
cgit v1.2.3


From e748dcd095ddee50e7a7deda2e26247715318a2e Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:59:12 -0800
Subject: vfs: remove get_xip_mem

All callers of get_xip_mem() are now gone.  Remove checks for it,
initialisers of it, documentation of it and the only implementation of it.
 Also remove mm/filemap_xip.c as it is now empty.  Also remove
documentation of the long-gone get_xip_page().

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |  3 ---
 Documentation/filesystems/vfs.txt |  7 ------
 fs/exofs/inode.c                  |  1 -
 fs/ext2/inode.c                   |  1 -
 fs/ext2/xip.c                     | 45 ---------------------------------------
 fs/ext2/xip.h                     |  3 ---
 fs/open.c                         |  5 +----
 include/linux/fs.h                |  2 --
 include/linux/rmap.h              |  2 +-
 mm/Makefile                       |  1 -
 mm/fadvise.c                      |  6 ++++--
 mm/filemap_xip.c                  | 24 ---------------------
 mm/madvise.c                      |  2 +-
 13 files changed, 7 insertions(+), 95 deletions(-)
 delete mode 100644 mm/filemap_xip.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index b30753cbf431..2ca3d17eee56 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -199,8 +199,6 @@ prototypes:
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
-	int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
-				unsigned long *);
 	int (*migratepage)(struct address_space *, struct page *, struct page *);
 	int (*launder_page)(struct page *);
 	int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
@@ -225,7 +223,6 @@ invalidatepage:		yes
 releasepage:		yes
 freepage:		yes
 direct_IO:
-get_xip_mem:					maybe
 migratepage:		yes (both)
 launder_page:		yes
 is_partially_uptodate:	yes
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 43ce0507ee25..966b22829f3b 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -591,8 +591,6 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
-	struct page* (*get_xip_page)(struct address_space *, sector_t,
-			int);
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
@@ -748,11 +746,6 @@ struct address_space_operations {
         and transfer data directly between the storage and the
         application's address space.
 
-  get_xip_page: called by the VM to translate a block number to a page.
-	The page is valid until the corresponding filesystem is unmounted.
-	Filesystems that want to use execute-in-place (XIP) need to implement
-	it.  An example implementation can be found in fs/ext2/xip.c.
-
   migrate_page:  This is used to compact the physical memory usage.
         If the VM wants to relocate a page (maybe off a memory card
         that is signalling imminent failure) it will pass a new page
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6fc91df99ff8..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
 	.direct_IO	= exofs_direct_IO,
 
 	/* With these NULL has special meaning or default is not exported */
-	.get_xip_mem	= NULL,
 	.migratepage	= NULL,
 	.launder_page	= NULL,
 	.is_partially_uptodate = NULL,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 5ac0a3461b3c..59d6c7d43740 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -894,7 +894,6 @@ const struct address_space_operations ext2_aops = {
 
 const struct address_space_operations ext2_aops_xip = {
 	.bmap			= ext2_bmap,
-	.get_xip_mem		= ext2_get_xip_mem,
 	.direct_IO		= ext2_direct_IO,
 };
 
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 8cfca3a4cd58..132d4daf98c4 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -13,35 +13,6 @@
 #include "ext2.h"
 #include "xip.h"
 
-static inline long __inode_direct_access(struct inode *inode, sector_t block,
-				void **kaddr, unsigned long *pfn, long size)
-{
-	struct block_device *bdev = inode->i_sb->s_bdev;
-	sector_t sector = block * (PAGE_SIZE / 512);
-	return bdev_direct_access(bdev, sector, kaddr, pfn, size);
-}
-
-static inline int
-__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
-		   sector_t *result)
-{
-	struct buffer_head tmp;
-	int rc;
-
-	memset(&tmp, 0, sizeof(struct buffer_head));
-	tmp.b_size = 1 << inode->i_blkbits;
-	rc = ext2_get_block(inode, pgoff, &tmp, create);
-	*result = tmp.b_blocknr;
-
-	/* did we get a sparse block (hole in the file)? */
-	if (!tmp.b_blocknr && !rc) {
-		BUG_ON(create);
-		rc = -ENODATA;
-	}
-
-	return rc;
-}
-
 void ext2_xip_verify_sb(struct super_block *sb)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
@@ -54,19 +25,3 @@ void ext2_xip_verify_sb(struct super_block *sb)
 			     "not supported by bdev");
 	}
 }
-
-int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
-				void **kmem, unsigned long *pfn)
-{
-	long rc;
-	sector_t block;
-
-	/* first, retrieve the sector number */
-	rc = __ext2_get_block(mapping->host, pgoff, create, &block);
-	if (rc)
-		return rc;
-
-	/* retrieve address of the target data */
-	rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE);
-	return (rc < 0) ? rc : 0;
-}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
index b2592f2f3c9d..e7b9f0a2cc54 100644
--- a/fs/ext2/xip.h
+++ b/fs/ext2/xip.h
@@ -12,10 +12,7 @@ static inline int ext2_use_xip (struct super_block *sb)
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
 }
-int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
-				void **, unsigned long *);
 #else
 #define ext2_xip_verify_sb(sb)			do { } while (0)
 #define ext2_use_xip(sb)			0
-#define ext2_get_xip_mem			NULL
 #endif
diff --git a/fs/open.c b/fs/open.c
index 813be037b412..a293c2020676 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
 {
 	/* NB: we're sure to have correct a_ops only after f_op->open */
 	if (f->f_flags & O_DIRECT) {
-		if (!f->f_mapping->a_ops ||
-		    ((!f->f_mapping->a_ops->direct_IO) &&
-		    (!f->f_mapping->a_ops->get_xip_mem))) {
+		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
 			return -EINVAL;
-		}
 	}
 	return 0;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c8f9055af38..9772d655f444 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -362,8 +362,6 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, gfp_t);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
-	int (*get_xip_mem)(struct address_space *, pgoff_t, int,
-						void **, unsigned long *);
 	/*
 	 * migrate the contents of a page to the specified target. If
 	 * migrate_mode is MIGRATE_ASYNC, it must not block.
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b38f559130d5..c4c559a45dc8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -198,7 +198,7 @@ int page_referenced(struct page *, int is_locked,
 int try_to_unmap(struct page *, enum ttu_flags flags);
 
 /*
- * Called from mm/filemap_xip.c to unmap empty zero page
+ * Used by uprobes to replace a userspace page safely
  */
 pte_t *__page_check_address(struct page *, struct mm_struct *,
 				unsigned long, spinlock_t **, int);
diff --git a/mm/Makefile b/mm/Makefile
index 088c68e9ec35..3c1caa2693bd 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,7 +55,6 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
-obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
diff --git a/mm/fadvise.c b/mm/fadvise.c
index fac23ecf8d72..4a3907cf79f8 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -28,6 +28,7 @@
 SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 {
 	struct fd f = fdget(fd);
+	struct inode *inode;
 	struct address_space *mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
@@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	if (!f.file)
 		return -EBADF;
 
-	if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+	inode = file_inode(f.file);
+	if (S_ISFIFO(inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
@@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		goto out;
 	}
 
-	if (mapping->a_ops->get_xip_mem) {
+	if (IS_DAX(inode)) {
 		switch (advice) {
 		case POSIX_FADV_NORMAL:
 		case POSIX_FADV_RANDOM:
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644
index 8e3f99b61959..000000000000
--- a/mm/filemap_xip.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- *	linux/mm/filemap_xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte <cotte@de.ibm.com>
- *
- * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
- *
- */
-
-#include <linux/fs.h>
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/rmap.h>
-#include <linux/mmu_notifier.h>
-#include <linux/sched.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
diff --git a/mm/madvise.c b/mm/madvise.c
index 1077cbdc8b52..d551475517bf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -239,7 +239,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
 		return -EBADF;
 #endif
 
-	if (file->f_mapping->a_ops->get_xip_mem) {
+	if (IS_DAX(file_inode(file))) {
 		/* no bad return value, but ignore advice */
 		return 0;
 	}
-- 
cgit v1.2.3


From 6cd176a51e52e5218b1aa97e1ec916bac25a9b7e Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:59:25 -0800
Subject: vfs,ext2: remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to
 CONFIG_FS_DAX

The fewer Kconfig options we have the better.  Use the generic
CONFIG_FS_DAX to enable XIP support in ext2 as well as in the core.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig         | 21 ++++++++++++++-------
 fs/Makefile        |  2 +-
 fs/ext2/Kconfig    | 11 -----------
 fs/ext2/ext2.h     |  2 +-
 fs/ext2/file.c     |  4 ++--
 fs/ext2/super.c    |  4 ++--
 include/linux/fs.h |  2 +-
 scripts/diffconfig |  1 -
 8 files changed, 21 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index a6bb530b1ec5..5331497d5b25 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
-	bool
-	depends on EXT2_FS_XIP
-	default y
-
 source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
@@ -40,6 +33,20 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 
+config FS_DAX
+	bool "Direct Access (DAX) support"
+	depends on MMU
+	help
+	  Direct Access (DAX) can be used on memory-backed block devices.
+	  If the block device supports DAX and the filesystem supports DAX,
+	  then you can avoid using the pagecache to buffer I/Os.  Turning
+	  on this option will compile in support for DAX; you will need to
+	  mount the filesystem using the -o dax option.
+
+	  If you do not have a block device that is capable of using this,
+	  or if unsure, say N.  Saying Y will increase the size of the kernel
+	  by about 5kB.
+
 endif # BLOCK
 
 # Posix ACL utility routines
diff --git a/fs/Makefile b/fs/Makefile
index 0534444e257c..0f4635f7c49c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,7 +28,7 @@ obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
-obj-$(CONFIG_FS_XIP)		+= dax.o
+obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
-	bool "Ext2 execute in place support"
-	depends on EXT2_FS && MMU
-	help
-	  Execute in place can be used on memory-backed block devices. If you
-	  enable this option, you can select to mount block devices which are
-	  capable of this feature without using the page cache.
-
-	  If you do not use a block device that is capable of using this,
-	  or if unsure, say N.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 30604c4d70e6..6854038c09ae 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,7 +380,7 @@ struct ext2_inode {
 #define EXT2_MOUNT_NO_UID32		0x000200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
-#ifdef CONFIG_FS_XIP
+#ifdef CONFIG_FS_DAX
 #define EXT2_MOUNT_XIP			0x010000  /* Execute in place */
 #else
 #define EXT2_MOUNT_XIP			0
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a61c93fd9dce..de8174d1e973 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,7 +25,7 @@
 #include "xattr.h"
 #include "acl.h"
 
-#ifdef CONFIG_EXT2_FS_XIP
+#ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	return dax_fault(vma, vmf, ext2_get_block);
@@ -108,7 +108,7 @@ const struct file_operations ext2_file_operations = {
 	.splice_write	= iter_file_splice_write,
 };
 
-#ifdef CONFIG_EXT2_FS_XIP
+#ifdef CONFIG_FS_DAX
 const struct file_operations ext2_xip_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= new_sync_read,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 50342583db1f..5f029d8c3a02 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -291,7 +291,7 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",grpquota");
 #endif
 
-#if defined(CONFIG_EXT2_FS_XIP)
+#ifdef CONFIG_FS_DAX
 	if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
 		seq_puts(seq, ",xip");
 #endif
@@ -558,7 +558,7 @@ static int parse_options(char *options, struct super_block *sb)
 			break;
 #endif
 		case Opt_xip:
-#ifdef CONFIG_EXT2_FS_XIP
+#ifdef CONFIG_FS_DAX
 			set_opt (sbi->s_mount_opt, XIP);
 #else
 			ext2_msg(sb, KERN_INFO, "xip option not supported");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9772d655f444..d46f8fe6a0ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1676,7 +1676,7 @@ struct super_operations {
 #define S_IMA		1024	/* Inode has an associated IMA struct */
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
-#ifdef CONFIG_FS_XIP
+#ifdef CONFIG_FS_DAX
 #define S_DAX		8192	/* Direct Access, avoiding the page cache */
 #else
 #define S_DAX		0	/* Make all the DAX code disappear */
diff --git a/scripts/diffconfig b/scripts/diffconfig
index 6d672836e187..0db267d0adc9 100755
--- a/scripts/diffconfig
+++ b/scripts/diffconfig
@@ -28,7 +28,6 @@ If no config files are specified, .config and .config.old are used.
 Example usage:
  $ diffconfig .config config-with-some-changes
 -EXT2_FS_XATTR  n
--EXT2_FS_XIP  n
  CRAMFS  n -> y
  EXT2_FS  y -> n
  LOG_BUF_SHIFT  14 -> 16
-- 
cgit v1.2.3


From 25726bc15731d42112b579cf73f30edbc43d3973 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Mon, 16 Feb 2015 15:59:35 -0800
Subject: dax: add dax_zero_page_range

This new function allows us to support hole-punch for DAX files by zeroing
a partial page, as opposed to the dax_truncate_page() function which can
only truncate to the end of the page.  Reimplement dax_truncate_page() to
call dax_zero_page_range().

[ross.zwisler@linux.intel.com: ported to 3.13-rc2]
[akpm@linux-foundation.org: fix typos in comments]
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/dax.txt |  1 +
 fs/dax.c                          | 38 ++++++++++++++++++++++++++++++++------
 include/linux/fs.h                |  1 +
 3 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 635adaa1e425..ebcd97f03654 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -62,6 +62,7 @@ Filesystem support consists of
   for fault and page_mkwrite (which should probably call dax_fault() and
   dax_mkwrite(), passing the appropriate get_block() callback)
 - calling dax_truncate_page() instead of block_truncate_page() for DAX files
+- calling dax_zero_page_range() instead of zero_user() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
   truncates and page faults
 
diff --git a/fs/dax.c b/fs/dax.c
index ebf9b2231b0f..ed1619ec6537 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -464,31 +464,35 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 EXPORT_SYMBOL_GPL(dax_fault);
 
 /**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * dax_zero_page_range - zero a range within a page of a DAX file
  * @inode: The file being truncated
  * @from: The file offset that is being truncated to
+ * @length: The number of bytes to zero
  * @get_block: The filesystem method used to translate file offsets to blocks
  *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating an DAX file to handle the partial page.
+ * This function can be called by a filesystem when it is zeroing part of a
+ * page in a DAX file.  This is intended for hole-punch operations.  If
+ * you are truncating a file, the helper function dax_truncate_page() may be
+ * more convenient.
  *
  * We work in terms of PAGE_CACHE_SIZE here for commonality with
  * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
  * took care of disposing of the unnecessary blocks.  Even if the filesystem
  * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmaped.
+ * since the file might be mmapped.
  */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
+							get_block_t get_block)
 {
 	struct buffer_head bh;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned length = PAGE_CACHE_ALIGN(from) - from;
 	int err;
 
 	/* Block boundary? Nothing to do */
 	if (!length)
 		return 0;
+	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
 	bh.b_size = PAGE_CACHE_SIZE;
@@ -505,4 +509,26 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+/**
+ * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * Similar to block_truncate_page(), this function can be called by a
+ * filesystem when it is truncating a DAX file to handle the partial page.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks.  Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ */
+int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+{
+	unsigned length = PAGE_CACHE_ALIGN(from) - from;
+	return dax_zero_page_range(inode, from, length, get_block);
+}
 EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d46f8fe6a0ea..ed5a0900b94d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2589,6 +2589,7 @@ extern int nonseekable_open(struct inode * inode, struct file * filp);
 ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
 		loff_t, get_block_t, dio_iodone_t, int flags);
 int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
-- 
cgit v1.2.3


From aaaf5fbf56f16c81a653713cc333b18ad6e25ea9 Mon Sep 17 00:00:00 2001
From: Joshua Kinard <kumba@gentoo.org>
Date: Mon, 16 Feb 2015 16:00:26 -0800
Subject: rtc: add driver for DS1685 family of real time clocks

This adds a driver for the Dallas/Maxim DS1685-family of RTC chips.  It
supports the DS1685/DS1687, DS1688/DS1691, DS1689/DS1693, DS17285/DS17287,
DS17485/DS17487, and DS17885/DS17887 RTC chips.  These chips are commonly
found in SGI O2 and SGI Octane systems.  It was originally derived from a
driver patch submitted by Matthias Fuchs many years ago for use in
EPPC-405-UC modules, which also used these RTCs.  In addition to the
time-keeping functions, this RTC also handles the shutdown mechanism of
the O2 and Octane and acts as a partial NVRAM for the boot PROMS in these
systems.

Verified on both an SGI O2 and an SGI Octane.

Signed-off-by: Joshua Kinard <kumba@gentoo.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                |    6 +
 drivers/rtc/Kconfig        |   90 ++
 drivers/rtc/Makefile       |    1 +
 drivers/rtc/rtc-ds1685.c   | 2252 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtc/ds1685.h |  375 ++++++++
 5 files changed, 2724 insertions(+)
 create mode 100644 drivers/rtc/rtc-ds1685.c
 create mode 100644 include/linux/rtc/ds1685.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6f2c849e6da3..d5ff4c3cdbac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2963,6 +2963,12 @@ S:	Supported
 F:	drivers/input/touchscreen/cyttsp*
 F:	include/linux/input/cyttsp.h
 
+DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK
+M:	Joshua Kinard <kumba@gentoo.org>
+S:	Maintained
+F:	drivers/rtc/rtc-ds1685.c
+F:	include/linux/rtc/ds1685.h
+
 DAMA SLAVE for AX.25
 M:	Joerg Reuter <jreuter@yaina.de>
 W:	http://yaina.de/jreuter/
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 3bc9ddbe5cf7..0cf2e1d9cb17 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -801,6 +801,96 @@ config RTC_DRV_DS1553
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-ds1553.
 
+config RTC_DRV_DS1685_FAMILY
+	tristate "Dallas/Maxim DS1685 Family"
+	help
+	  If you say yes here you get support for the Dallas/Maxim DS1685
+	  family of real time chips.  This family includes the DS1685/DS1687,
+	  DS1689/DS1693, DS17285/DS17287, DS17485/DS17487, and
+	  DS17885/DS17887 chips.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-ds1685.
+
+choice
+	prompt "Subtype"
+	depends on RTC_DRV_DS1685_FAMILY
+	default RTC_DRV_DS1685
+
+config RTC_DRV_DS1685
+	bool "DS1685/DS1687"
+	help
+	  This enables support for the Dallas/Maxim DS1685/DS1687 real time
+	  clock chip.
+
+	  This chip is commonly found in SGI O2 (IP32) and SGI Octane (IP30)
+	  systems, as well as EPPC-405-UC modules by electronic system design
+	  GmbH.
+
+config RTC_DRV_DS1689
+	bool "DS1689/DS1693"
+	help
+	  This enables support for the Dallas/Maxim DS1689/DS1693 real time
+	  clock chip.
+
+	  This is an older RTC chip, supplanted by the DS1685/DS1687 above,
+	  which supports a few minor features such as Vcc, Vbat, and Power
+	  Cycle counters, plus a customer-specific, 8-byte ROM/Serial number.
+
+	  It also works for the even older DS1688/DS1691 RTC chips, which are
+	  virtually the same and carry the same model number.  Both chips
+	  have 114 bytes of user NVRAM.
+
+config RTC_DRV_DS17285
+	bool "DS17285/DS17287"
+	help
+	  This enables support for the Dallas/Maxim DS17285/DS17287 real time
+	  clock chip.
+
+	  This chip features 2kb of extended NV-SRAM.  It may possibly be
+	  found in some SGI O2 systems (rare).
+
+config RTC_DRV_DS17485
+	bool "DS17485/DS17487"
+	help
+	  This enables support for the Dallas/Maxim DS17485/DS17487 real time
+	  clock chip.
+
+	  This chip features 4kb of extended NV-SRAM.
+
+config RTC_DRV_DS17885
+	bool "DS17885/DS17887"
+	help
+	  This enables support for the Dallas/Maxim DS17885/DS17887 real time
+	  clock chip.
+
+	  This chip features 8kb of extended NV-SRAM.
+
+endchoice
+
+config RTC_DS1685_PROC_REGS
+	bool "Display register values in /proc"
+	depends on RTC_DRV_DS1685_FAMILY && PROC_FS
+	help
+	  Enable this to display a readout of all of the RTC registers in
+	  /proc/drivers/rtc.  Keep in mind that this can potentially lead
+	  to lost interrupts, as reading Control Register C will clear
+	  all pending IRQ flags.
+
+	  Unless you are debugging this driver, choose N.
+
+config RTC_DS1685_SYSFS_REGS
+	bool "SysFS access to RTC register bits"
+	depends on RTC_DRV_DS1685_FAMILY && SYSFS
+	help
+	  Enable this to provide access to the RTC control register bits
+	  in /sys.  Some of the bits are read-write, others are read-only.
+
+	  Keep in mind that reading Control C's bits automatically clears
+	  all pending IRQ flags - this can cause lost interrupts.
+
+	  If you know that you need access to these bits, choose Y, Else N.
+
 config RTC_DRV_DS1742
 	tristate "Maxim/Dallas DS1742/1743"
 	depends on HAS_IOMEM
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 99ded8b75e95..69c87062b098 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_RTC_DRV_DS1390)	+= rtc-ds1390.o
 obj-$(CONFIG_RTC_DRV_DS1511)	+= rtc-ds1511.o
 obj-$(CONFIG_RTC_DRV_DS1553)	+= rtc-ds1553.o
 obj-$(CONFIG_RTC_DRV_DS1672)	+= rtc-ds1672.o
+obj-$(CONFIG_RTC_DRV_DS1685_FAMILY)	+= rtc-ds1685.o
 obj-$(CONFIG_RTC_DRV_DS1742)	+= rtc-ds1742.o
 obj-$(CONFIG_RTC_DRV_DS2404)    += rtc-ds2404.o
 obj-$(CONFIG_RTC_DRV_DS3232)	+= rtc-ds3232.o
diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c
new file mode 100644
index 000000000000..8c3bfcb115b7
--- /dev/null
+++ b/drivers/rtc/rtc-ds1685.c
@@ -0,0 +1,2252 @@
+/*
+ * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time
+ * chips.
+ *
+ * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>.
+ * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>.
+ *
+ * References:
+ *    DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10.
+ *    DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10.
+ *    DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105.
+ *    Application Note 90, Using the Multiplex Bus RTC Extended Features.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/bcd.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/workqueue.h>
+
+#include <linux/rtc/ds1685.h>
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#endif
+
+#define DRV_VERSION	"0.42.0"
+
+
+/* ----------------------------------------------------------------------- */
+/* Standard read/write functions if platform does not provide overrides */
+
+/**
+ * ds1685_read - read a value from an rtc register.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @reg: the register address to read.
+ */
+static u8
+ds1685_read(struct ds1685_priv *rtc, int reg)
+{
+	return readb((u8 __iomem *)rtc->regs +
+		     (reg * rtc->regstep));
+}
+
+/**
+ * ds1685_write - write a value to an rtc register.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @reg: the register address to write.
+ * @value: value to write to the register.
+ */
+static void
+ds1685_write(struct ds1685_priv *rtc, int reg, u8 value)
+{
+	writeb(value, ((u8 __iomem *)rtc->regs +
+		       (reg * rtc->regstep)));
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Inlined functions */
+
+/**
+ * ds1685_rtc_bcd2bin - bcd2bin wrapper in case platform doesn't support BCD.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @val: u8 time value to consider converting.
+ * @bcd_mask: u8 mask value if BCD mode is used.
+ * @bin_mask: u8 mask value if BIN mode is used.
+ *
+ * Returns the value, converted to BIN if originally in BCD and bcd_mode TRUE.
+ */
+static inline u8
+ds1685_rtc_bcd2bin(struct ds1685_priv *rtc, u8 val, u8 bcd_mask, u8 bin_mask)
+{
+	if (rtc->bcd_mode)
+		return (bcd2bin(val) & bcd_mask);
+
+	return (val & bin_mask);
+}
+
+/**
+ * ds1685_rtc_bin2bcd - bin2bcd wrapper in case platform doesn't support BCD.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @val: u8 time value to consider converting.
+ * @bin_mask: u8 mask value if BIN mode is used.
+ * @bcd_mask: u8 mask value if BCD mode is used.
+ *
+ * Returns the value, converted to BCD if originally in BIN and bcd_mode TRUE.
+ */
+static inline u8
+ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask)
+{
+	if (rtc->bcd_mode)
+		return (bin2bcd(val) & bcd_mask);
+
+	return (val & bin_mask);
+}
+
+/**
+ * ds1685_rtc_switch_to_bank0 - switch the rtc to bank 0.
+ * @rtc: pointer to the ds1685 rtc structure.
+ */
+static inline void
+ds1685_rtc_switch_to_bank0(struct ds1685_priv *rtc)
+{
+	rtc->write(rtc, RTC_CTRL_A,
+		   (rtc->read(rtc, RTC_CTRL_A) & ~(RTC_CTRL_A_DV0)));
+}
+
+/**
+ * ds1685_rtc_switch_to_bank1 - switch the rtc to bank 1.
+ * @rtc: pointer to the ds1685 rtc structure.
+ */
+static inline void
+ds1685_rtc_switch_to_bank1(struct ds1685_priv *rtc)
+{
+	rtc->write(rtc, RTC_CTRL_A,
+		   (rtc->read(rtc, RTC_CTRL_A) | RTC_CTRL_A_DV0));
+}
+
+/**
+ * ds1685_rtc_begin_data_access - prepare the rtc for data access.
+ * @rtc: pointer to the ds1685 rtc structure.
+ *
+ * This takes several steps to prepare the rtc for access to get/set time
+ * and alarm values from the rtc registers:
+ *  - Sets the SET bit in Control Register B.
+ *  - Reads Ext Control Register 4A and checks the INCR bit.
+ *  - If INCR is active, a short delay is added before Ext Control Register 4A
+ *    is read again in a loop until INCR is inactive.
+ *  - Switches the rtc to bank 1.  This allows access to all relevant
+ *    data for normal rtc operation, as bank 0 contains only the nvram.
+ */
+static inline void
+ds1685_rtc_begin_data_access(struct ds1685_priv *rtc)
+{
+	/* Set the SET bit in Ctrl B */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET));
+
+	/* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */
+	while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR)
+		cpu_relax();
+
+	/* Switch to Bank 1 */
+	ds1685_rtc_switch_to_bank1(rtc);
+}
+
+/**
+ * ds1685_rtc_end_data_access - end data access on the rtc.
+ * @rtc: pointer to the ds1685 rtc structure.
+ *
+ * This ends what was started by ds1685_rtc_begin_data_access:
+ *  - Switches the rtc back to bank 0.
+ *  - Clears the SET bit in Control Register B.
+ */
+static inline void
+ds1685_rtc_end_data_access(struct ds1685_priv *rtc)
+{
+	/* Switch back to Bank 0 */
+	ds1685_rtc_switch_to_bank1(rtc);
+
+	/* Clear the SET bit in Ctrl B */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET)));
+}
+
+/**
+ * ds1685_rtc_begin_ctrl_access - prepare the rtc for ctrl access.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @flags: irq flags variable for spin_lock_irqsave.
+ *
+ * This takes several steps to prepare the rtc for access to read just the
+ * control registers:
+ *  - Sets a spinlock on the rtc IRQ.
+ *  - Switches the rtc to bank 1.  This allows access to the two extended
+ *    control registers.
+ *
+ * Only use this where you are certain another lock will not be held.
+ */
+static inline void
+ds1685_rtc_begin_ctrl_access(struct ds1685_priv *rtc, unsigned long flags)
+{
+	spin_lock_irqsave(&rtc->lock, flags);
+	ds1685_rtc_switch_to_bank1(rtc);
+}
+
+/**
+ * ds1685_rtc_end_ctrl_access - end ctrl access on the rtc.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @flags: irq flags variable for spin_unlock_irqrestore.
+ *
+ * This ends what was started by ds1685_rtc_begin_ctrl_access:
+ *  - Switches the rtc back to bank 0.
+ *  - Unsets the spinlock on the rtc IRQ.
+ */
+static inline void
+ds1685_rtc_end_ctrl_access(struct ds1685_priv *rtc, unsigned long flags)
+{
+	ds1685_rtc_switch_to_bank0(rtc);
+	spin_unlock_irqrestore(&rtc->lock, flags);
+}
+
+/**
+ * ds1685_rtc_get_ssn - retrieve the silicon serial number.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @ssn: u8 array to hold the bits of the silicon serial number.
+ *
+ * This number starts at 0x40, and is 8-bytes long, ending at 0x47. The
+ * first byte is the model number, the next six bytes are the serial number
+ * digits, and the final byte is a CRC check byte.  Together, they form the
+ * silicon serial number.
+ *
+ * These values are stored in bank1, so ds1685_rtc_switch_to_bank1 must be
+ * called first before calling this function, else data will be read out of
+ * the bank0 NVRAM.  Be sure to call ds1685_rtc_switch_to_bank0 when done.
+ */
+static inline void
+ds1685_rtc_get_ssn(struct ds1685_priv *rtc, u8 *ssn)
+{
+	ssn[0] = rtc->read(rtc, RTC_BANK1_SSN_MODEL);
+	ssn[1] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_1);
+	ssn[2] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_2);
+	ssn[3] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_3);
+	ssn[4] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_4);
+	ssn[5] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_5);
+	ssn[6] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_6);
+	ssn[7] = rtc->read(rtc, RTC_BANK1_SSN_CRC);
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Read/Set Time & Alarm functions */
+
+/**
+ * ds1685_rtc_read_time - reads the time registers.
+ * @dev: pointer to device structure.
+ * @tm: pointer to rtc_time structure.
+ */
+static int
+ds1685_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrlb, century;
+	u8 seconds, minutes, hours, wday, mday, month, years;
+
+	/* Fetch the time info from the RTC registers. */
+	ds1685_rtc_begin_data_access(rtc);
+	seconds = rtc->read(rtc, RTC_SECS);
+	minutes = rtc->read(rtc, RTC_MINS);
+	hours   = rtc->read(rtc, RTC_HRS);
+	wday    = rtc->read(rtc, RTC_WDAY);
+	mday    = rtc->read(rtc, RTC_MDAY);
+	month   = rtc->read(rtc, RTC_MONTH);
+	years   = rtc->read(rtc, RTC_YEAR);
+	century = rtc->read(rtc, RTC_CENTURY);
+	ctrlb   = rtc->read(rtc, RTC_CTRL_B);
+	ds1685_rtc_end_data_access(rtc);
+
+	/* bcd2bin if needed, perform fixups, and store to rtc_time. */
+	years        = ds1685_rtc_bcd2bin(rtc, years, RTC_YEAR_BCD_MASK,
+					  RTC_YEAR_BIN_MASK);
+	century      = ds1685_rtc_bcd2bin(rtc, century, RTC_CENTURY_MASK,
+					  RTC_CENTURY_MASK);
+	tm->tm_sec   = ds1685_rtc_bcd2bin(rtc, seconds, RTC_SECS_BCD_MASK,
+					  RTC_SECS_BIN_MASK);
+	tm->tm_min   = ds1685_rtc_bcd2bin(rtc, minutes, RTC_MINS_BCD_MASK,
+					  RTC_MINS_BIN_MASK);
+	tm->tm_hour  = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_24_BCD_MASK,
+					  RTC_HRS_24_BIN_MASK);
+	tm->tm_wday  = (ds1685_rtc_bcd2bin(rtc, wday, RTC_WDAY_MASK,
+					   RTC_WDAY_MASK) - 1);
+	tm->tm_mday  = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK,
+					  RTC_MDAY_BIN_MASK);
+	tm->tm_mon   = (ds1685_rtc_bcd2bin(rtc, month, RTC_MONTH_BCD_MASK,
+					   RTC_MONTH_BIN_MASK) - 1);
+	tm->tm_year  = ((years + (century * 100)) - 1900);
+	tm->tm_yday  = rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year);
+	tm->tm_isdst = 0; /* RTC has hardcoded timezone, so don't use. */
+
+	return rtc_valid_tm(tm);
+}
+
+/**
+ * ds1685_rtc_set_time - sets the time registers.
+ * @dev: pointer to device structure.
+ * @tm: pointer to rtc_time structure.
+ */
+static int
+ds1685_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrlb, seconds, minutes, hours, wday, mday, month, years, century;
+
+	/* Fetch the time info from rtc_time. */
+	seconds = ds1685_rtc_bin2bcd(rtc, tm->tm_sec, RTC_SECS_BIN_MASK,
+				     RTC_SECS_BCD_MASK);
+	minutes = ds1685_rtc_bin2bcd(rtc, tm->tm_min, RTC_MINS_BIN_MASK,
+				     RTC_MINS_BCD_MASK);
+	hours   = ds1685_rtc_bin2bcd(rtc, tm->tm_hour, RTC_HRS_24_BIN_MASK,
+				     RTC_HRS_24_BCD_MASK);
+	wday    = ds1685_rtc_bin2bcd(rtc, (tm->tm_wday + 1), RTC_WDAY_MASK,
+				     RTC_WDAY_MASK);
+	mday    = ds1685_rtc_bin2bcd(rtc, tm->tm_mday, RTC_MDAY_BIN_MASK,
+				     RTC_MDAY_BCD_MASK);
+	month   = ds1685_rtc_bin2bcd(rtc, (tm->tm_mon + 1), RTC_MONTH_BIN_MASK,
+				     RTC_MONTH_BCD_MASK);
+	years   = ds1685_rtc_bin2bcd(rtc, (tm->tm_year % 100),
+				     RTC_YEAR_BIN_MASK, RTC_YEAR_BCD_MASK);
+	century = ds1685_rtc_bin2bcd(rtc, ((tm->tm_year + 1900) / 100),
+				     RTC_CENTURY_MASK, RTC_CENTURY_MASK);
+
+	/*
+	 * Perform Sanity Checks:
+	 *   - Months: !> 12, Month Day != 0.
+	 *   - Month Day !> Max days in current month.
+	 *   - Hours !>= 24, Mins !>= 60, Secs !>= 60, & Weekday !> 7.
+	 */
+	if ((tm->tm_mon > 11) || (mday == 0))
+		return -EDOM;
+
+	if (tm->tm_mday > rtc_month_days(tm->tm_mon, tm->tm_year))
+		return -EDOM;
+
+	if ((tm->tm_hour >= 24) || (tm->tm_min >= 60) ||
+	    (tm->tm_sec >= 60)  || (wday > 7))
+		return -EDOM;
+
+	/*
+	 * Set the data mode to use and store the time values in the
+	 * RTC registers.
+	 */
+	ds1685_rtc_begin_data_access(rtc);
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	if (rtc->bcd_mode)
+		ctrlb &= ~(RTC_CTRL_B_DM);
+	else
+		ctrlb |= RTC_CTRL_B_DM;
+	rtc->write(rtc, RTC_CTRL_B, ctrlb);
+	rtc->write(rtc, RTC_SECS, seconds);
+	rtc->write(rtc, RTC_MINS, minutes);
+	rtc->write(rtc, RTC_HRS, hours);
+	rtc->write(rtc, RTC_WDAY, wday);
+	rtc->write(rtc, RTC_MDAY, mday);
+	rtc->write(rtc, RTC_MONTH, month);
+	rtc->write(rtc, RTC_YEAR, years);
+	rtc->write(rtc, RTC_CENTURY, century);
+	ds1685_rtc_end_data_access(rtc);
+
+	return 0;
+}
+
+/**
+ * ds1685_rtc_read_alarm - reads the alarm registers.
+ * @dev: pointer to device structure.
+ * @alrm: pointer to rtc_wkalrm structure.
+ *
+ * There are three primary alarm registers: seconds, minutes, and hours.
+ * A fourth alarm register for the month date is also available in bank1 for
+ * kickstart/wakeup features.  The DS1685/DS1687 manual states that a
+ * "don't care" value ranging from 0xc0 to 0xff may be written into one or
+ * more of the three alarm bytes to act as a wildcard value.  The fourth
+ * byte doesn't support a "don't care" value.
+ */
+static int
+ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 seconds, minutes, hours, mday, ctrlb, ctrlc;
+
+	/* Fetch the alarm info from the RTC alarm registers. */
+	ds1685_rtc_begin_data_access(rtc);
+	seconds	= rtc->read(rtc, RTC_SECS_ALARM);
+	minutes	= rtc->read(rtc, RTC_MINS_ALARM);
+	hours	= rtc->read(rtc, RTC_HRS_ALARM);
+	mday	= rtc->read(rtc, RTC_MDAY_ALARM);
+	ctrlb	= rtc->read(rtc, RTC_CTRL_B);
+	ctrlc	= rtc->read(rtc, RTC_CTRL_C);
+	ds1685_rtc_end_data_access(rtc);
+
+	/* Check month date. */
+	if (!(mday >= 1) && (mday <= 31))
+		return -EDOM;
+
+	/*
+	 * Check the three alarm bytes.
+	 *
+	 * The Linux RTC system doesn't support the "don't care" capability
+	 * of this RTC chip.  We check for it anyways in case support is
+	 * added in the future.
+	 */
+	if (unlikely((seconds >= 0xc0) && (seconds <= 0xff)))
+		alrm->time.tm_sec = -1;
+	else
+		alrm->time.tm_sec = ds1685_rtc_bcd2bin(rtc, seconds,
+						       RTC_SECS_BCD_MASK,
+						       RTC_SECS_BIN_MASK);
+
+	if (unlikely((minutes >= 0xc0) && (minutes <= 0xff)))
+		alrm->time.tm_min = -1;
+	else
+		alrm->time.tm_min = ds1685_rtc_bcd2bin(rtc, minutes,
+						       RTC_MINS_BCD_MASK,
+						       RTC_MINS_BIN_MASK);
+
+	if (unlikely((hours >= 0xc0) && (hours <= 0xff)))
+		alrm->time.tm_hour = -1;
+	else
+		alrm->time.tm_hour = ds1685_rtc_bcd2bin(rtc, hours,
+							RTC_HRS_24_BCD_MASK,
+							RTC_HRS_24_BIN_MASK);
+
+	/* Write the data to rtc_wkalrm. */
+	alrm->time.tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK,
+						RTC_MDAY_BIN_MASK);
+	alrm->time.tm_mon = -1;
+	alrm->time.tm_year = -1;
+	alrm->time.tm_wday = -1;
+	alrm->time.tm_yday = -1;
+	alrm->time.tm_isdst = -1;
+	alrm->enabled = !!(ctrlb & RTC_CTRL_B_AIE);
+	alrm->pending = !!(ctrlc & RTC_CTRL_C_AF);
+
+	return 0;
+}
+
+/**
+ * ds1685_rtc_set_alarm - sets the alarm in registers.
+ * @dev: pointer to device structure.
+ * @alrm: pointer to rtc_wkalrm structure.
+ */
+static int
+ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrlb, seconds, minutes, hours, mday;
+
+	/* Fetch the alarm info and convert to BCD. */
+	seconds	= ds1685_rtc_bin2bcd(rtc, alrm->time.tm_sec,
+				     RTC_SECS_BIN_MASK,
+				     RTC_SECS_BCD_MASK);
+	minutes	= ds1685_rtc_bin2bcd(rtc, alrm->time.tm_min,
+				     RTC_MINS_BIN_MASK,
+				     RTC_MINS_BCD_MASK);
+	hours	= ds1685_rtc_bin2bcd(rtc, alrm->time.tm_hour,
+				     RTC_HRS_24_BIN_MASK,
+				     RTC_HRS_24_BCD_MASK);
+	mday	= ds1685_rtc_bin2bcd(rtc, alrm->time.tm_mday,
+				     RTC_MDAY_BIN_MASK,
+				     RTC_MDAY_BCD_MASK);
+
+	/* Check the month date for validity. */
+	if (!(mday >= 1) && (mday <= 31))
+		return -EDOM;
+
+	/*
+	 * Check the three alarm bytes.
+	 *
+	 * The Linux RTC system doesn't support the "don't care" capability
+	 * of this RTC chip because rtc_valid_tm tries to validate every
+	 * field, and we only support four fields.  We put the support
+	 * here anyways for the future.
+	 */
+	if (unlikely((seconds >= 0xc0) && (seconds <= 0xff)))
+		seconds = 0xff;
+
+	if (unlikely((minutes >= 0xc0) && (minutes <= 0xff)))
+		minutes = 0xff;
+
+	if (unlikely((hours >= 0xc0) && (hours <= 0xff)))
+		hours = 0xff;
+
+	alrm->time.tm_mon	= -1;
+	alrm->time.tm_year	= -1;
+	alrm->time.tm_wday	= -1;
+	alrm->time.tm_yday	= -1;
+	alrm->time.tm_isdst	= -1;
+
+	/* Disable the alarm interrupt first. */
+	ds1685_rtc_begin_data_access(rtc);
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	rtc->write(rtc, RTC_CTRL_B, (ctrlb & ~(RTC_CTRL_B_AIE)));
+
+	/* Read ctrlc to clear RTC_CTRL_C_AF. */
+	rtc->read(rtc, RTC_CTRL_C);
+
+	/*
+	 * Set the data mode to use and store the time values in the
+	 * RTC registers.
+	 */
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	if (rtc->bcd_mode)
+		ctrlb &= ~(RTC_CTRL_B_DM);
+	else
+		ctrlb |= RTC_CTRL_B_DM;
+	rtc->write(rtc, RTC_CTRL_B, ctrlb);
+	rtc->write(rtc, RTC_SECS_ALARM, seconds);
+	rtc->write(rtc, RTC_MINS_ALARM, minutes);
+	rtc->write(rtc, RTC_HRS_ALARM, hours);
+	rtc->write(rtc, RTC_MDAY_ALARM, mday);
+
+	/* Re-enable the alarm if needed. */
+	if (alrm->enabled) {
+		ctrlb = rtc->read(rtc, RTC_CTRL_B);
+		ctrlb |= RTC_CTRL_B_AIE;
+		rtc->write(rtc, RTC_CTRL_B, ctrlb);
+	}
+
+	/* Done! */
+	ds1685_rtc_end_data_access(rtc);
+
+	return 0;
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* /dev/rtcX Interface functions */
+
+#ifdef CONFIG_RTC_INTF_DEV
+/**
+ * ds1685_rtc_alarm_irq_enable - replaces ioctl() RTC_AIE on/off.
+ * @dev: pointer to device structure.
+ * @enabled: flag indicating whether to enable or disable.
+ */
+static int
+ds1685_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	unsigned long flags = 0;
+
+	/* Enable/disable the Alarm IRQ-Enable flag. */
+	spin_lock_irqsave(&rtc->lock, flags);
+
+	/* Flip the requisite interrupt-enable bit. */
+	if (enabled)
+		rtc->write(rtc, RTC_CTRL_B, (rtc->read(rtc, RTC_CTRL_B) |
+					     RTC_CTRL_B_AIE));
+	else
+		rtc->write(rtc, RTC_CTRL_B, (rtc->read(rtc, RTC_CTRL_B) &
+					     ~(RTC_CTRL_B_AIE)));
+
+	/* Read Control C to clear all the flag bits. */
+	rtc->read(rtc, RTC_CTRL_C);
+	spin_unlock_irqrestore(&rtc->lock, flags);
+
+	return 0;
+}
+#endif
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* IRQ handler & workqueue. */
+
+/**
+ * ds1685_rtc_irq_handler - IRQ handler.
+ * @irq: IRQ number.
+ * @dev_id: platform device pointer.
+ */
+static irqreturn_t
+ds1685_rtc_irq_handler(int irq, void *dev_id)
+{
+	struct platform_device *pdev = dev_id;
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrlb, ctrlc;
+	unsigned long events = 0;
+	u8 num_irqs = 0;
+
+	/* Abort early if the device isn't ready yet (i.e., DEBUG_SHIRQ). */
+	if (unlikely(!rtc))
+		return IRQ_HANDLED;
+
+	/* Ctrlb holds the interrupt-enable bits and ctrlc the flag bits. */
+	spin_lock(&rtc->lock);
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	ctrlc = rtc->read(rtc, RTC_CTRL_C);
+
+	/* Is the IRQF bit set? */
+	if (likely(ctrlc & RTC_CTRL_C_IRQF)) {
+		/*
+		 * We need to determine if it was one of the standard
+		 * events: PF, AF, or UF.  If so, we handle them and
+		 * update the RTC core.
+		 */
+		if (likely(ctrlc & RTC_CTRL_B_PAU_MASK)) {
+			events = RTC_IRQF;
+
+			/* Check for a periodic interrupt. */
+			if ((ctrlb & RTC_CTRL_B_PIE) &&
+			    (ctrlc & RTC_CTRL_C_PF)) {
+				events |= RTC_PF;
+				num_irqs++;
+			}
+
+			/* Check for an alarm interrupt. */
+			if ((ctrlb & RTC_CTRL_B_AIE) &&
+			    (ctrlc & RTC_CTRL_C_AF)) {
+				events |= RTC_AF;
+				num_irqs++;
+			}
+
+			/* Check for an update interrupt. */
+			if ((ctrlb & RTC_CTRL_B_UIE) &&
+			    (ctrlc & RTC_CTRL_C_UF)) {
+				events |= RTC_UF;
+				num_irqs++;
+			}
+
+			rtc_update_irq(rtc->dev, num_irqs, events);
+		} else {
+			/*
+			 * One of the "extended" interrupts was received that
+			 * is not recognized by the RTC core.  These need to
+			 * be handled in task context as they can call other
+			 * functions and the time spent in irq context needs
+			 * to be minimized.  Schedule them into a workqueue
+			 * and inform the RTC core that the IRQs were handled.
+			 */
+			spin_unlock(&rtc->lock);
+			schedule_work(&rtc->work);
+			rtc_update_irq(rtc->dev, 0, 0);
+			return IRQ_HANDLED;
+		}
+	}
+	spin_unlock(&rtc->lock);
+
+	return events ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/**
+ * ds1685_rtc_work_queue - work queue handler.
+ * @work: work_struct containing data to work on in task context.
+ */
+static void
+ds1685_rtc_work_queue(struct work_struct *work)
+{
+	struct ds1685_priv *rtc = container_of(work,
+					       struct ds1685_priv, work);
+	struct platform_device *pdev = to_platform_device(&rtc->dev->dev);
+	struct mutex *rtc_mutex = &rtc->dev->ops_lock;
+	u8 ctrl4a, ctrl4b;
+
+	mutex_lock(rtc_mutex);
+
+	ds1685_rtc_switch_to_bank1(rtc);
+	ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+	ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B);
+
+	/*
+	 * Check for a kickstart interrupt. With Vcc applied, this
+	 * typically means that the power button was pressed, so we
+	 * begin the shutdown sequence.
+	 */
+	if ((ctrl4b & RTC_CTRL_4B_KSE) && (ctrl4a & RTC_CTRL_4A_KF)) {
+		/* Briefly disable kickstarts to debounce button presses. */
+		rtc->write(rtc, RTC_EXT_CTRL_4B,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4B) &
+			    ~(RTC_CTRL_4B_KSE)));
+
+		/* Clear the kickstart flag. */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (ctrl4a & ~(RTC_CTRL_4A_KF)));
+
+
+		/*
+		 * Sleep 500ms before re-enabling kickstarts.  This allows
+		 * adequate time to avoid reading signal jitter as additional
+		 * button presses.
+		 */
+		msleep(500);
+		rtc->write(rtc, RTC_EXT_CTRL_4B,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4B) |
+			    RTC_CTRL_4B_KSE));
+
+		/* Call the platform pre-poweroff function. Else, shutdown. */
+		if (rtc->prepare_poweroff != NULL)
+			rtc->prepare_poweroff();
+		else
+			ds1685_rtc_poweroff(pdev);
+	}
+
+	/*
+	 * Check for a wake-up interrupt.  With Vcc applied, this is
+	 * essentially a second alarm interrupt, except it takes into
+	 * account the 'date' register in bank1 in addition to the
+	 * standard three alarm registers.
+	 */
+	if ((ctrl4b & RTC_CTRL_4B_WIE) && (ctrl4a & RTC_CTRL_4A_WF)) {
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (ctrl4a & ~(RTC_CTRL_4A_WF)));
+
+		/* Call the platform wake_alarm function if defined. */
+		if (rtc->wake_alarm != NULL)
+			rtc->wake_alarm();
+		else
+			dev_warn(&pdev->dev,
+				 "Wake Alarm IRQ just occurred!\n");
+	}
+
+	/*
+	 * Check for a ram-clear interrupt.  This happens if RIE=1 and RF=0
+	 * when RCE=1 in 4B.  This clears all NVRAM bytes in bank0 by setting
+	 * each byte to a logic 1.  This has no effect on any extended
+	 * NV-SRAM that might be present, nor on the time/calendar/alarm
+	 * registers.  After a ram-clear is completed, there is a minimum
+	 * recovery time of ~150ms in which all reads/writes are locked out.
+	 * NOTE: A ram-clear can still occur if RCE=1 and RIE=0.  We cannot
+	 * catch this scenario.
+	 */
+	if ((ctrl4b & RTC_CTRL_4B_RIE) && (ctrl4a & RTC_CTRL_4A_RF)) {
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (ctrl4a & ~(RTC_CTRL_4A_RF)));
+		msleep(150);
+
+		/* Call the platform post_ram_clear function if defined. */
+		if (rtc->post_ram_clear != NULL)
+			rtc->post_ram_clear();
+		else
+			dev_warn(&pdev->dev,
+				 "RAM-Clear IRQ just occurred!\n");
+	}
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	mutex_unlock(rtc_mutex);
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* ProcFS interface */
+
+#ifdef CONFIG_PROC_FS
+#define NUM_REGS	6	/* Num of control registers. */
+#define NUM_BITS	8	/* Num bits per register. */
+#define NUM_SPACES	4	/* Num spaces between each bit. */
+
+/*
+ * Periodic Interrupt Rates.
+ */
+static const char *ds1685_rtc_pirq_rate[16] = {
+	"none", "3.90625ms", "7.8125ms", "0.122070ms", "0.244141ms",
+	"0.488281ms", "0.9765625ms", "1.953125ms", "3.90625ms", "7.8125ms",
+	"15.625ms", "31.25ms", "62.5ms", "125ms", "250ms", "500ms"
+};
+
+/*
+ * Square-Wave Output Frequencies.
+ */
+static const char *ds1685_rtc_sqw_freq[16] = {
+	"none", "256Hz", "128Hz", "8192Hz", "4096Hz", "2048Hz", "1024Hz",
+	"512Hz", "256Hz", "128Hz", "64Hz", "32Hz", "16Hz", "8Hz", "4Hz", "2Hz"
+};
+
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+/**
+ * ds1685_rtc_print_regs - helper function to print register values.
+ * @hex: hex byte to convert into binary bits.
+ * @dest: destination char array.
+ *
+ * This is basically a hex->binary function, just with extra spacing between
+ * the digits.  It only works on 1-byte values (8 bits).
+ */
+static char*
+ds1685_rtc_print_regs(u8 hex, char *dest)
+{
+	u32 i, j;
+	char *tmp = dest;
+
+	for (i = 0; i < NUM_BITS; i++) {
+		*tmp++ = ((hex & 0x80) != 0 ? '1' : '0');
+		for (j = 0; j < NUM_SPACES; j++)
+			*tmp++ = ' ';
+		hex <<= 1;
+	}
+	*tmp++ = '\0';
+
+	return dest;
+}
+#endif
+
+/**
+ * ds1685_rtc_proc - procfs access function.
+ * @dev: pointer to device structure.
+ * @seq: pointer to seq_file structure.
+ */
+static int
+ds1685_rtc_proc(struct device *dev, struct seq_file *seq)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrla, ctrlb, ctrlc, ctrld, ctrl4a, ctrl4b, ssn[8];
+	char *model = '\0';
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+	char bits[NUM_REGS][(NUM_BITS * NUM_SPACES) + NUM_BITS + 1];
+#endif
+
+	/* Read all the relevant data from the control registers. */
+	ds1685_rtc_switch_to_bank1(rtc);
+	ds1685_rtc_get_ssn(rtc, ssn);
+	ctrla = rtc->read(rtc, RTC_CTRL_A);
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	ctrlc = rtc->read(rtc, RTC_CTRL_C);
+	ctrld = rtc->read(rtc, RTC_CTRL_D);
+	ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+	ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Determine the RTC model. */
+	switch (ssn[0]) {
+	case RTC_MODEL_DS1685:
+		model = "DS1685/DS1687\0";
+		break;
+	case RTC_MODEL_DS1689:
+		model = "DS1689/DS1693\0";
+		break;
+	case RTC_MODEL_DS17285:
+		model = "DS17285/DS17287\0";
+		break;
+	case RTC_MODEL_DS17485:
+		model = "DS17485/DS17487\0";
+		break;
+	case RTC_MODEL_DS17885:
+		model = "DS17885/DS17887\0";
+		break;
+	default:
+		model = "Unknown\0";
+		break;
+	}
+
+	/* Print out the information. */
+	seq_printf(seq,
+	   "Model\t\t: %s\n"
+	   "Oscillator\t: %s\n"
+	   "12/24hr\t\t: %s\n"
+	   "DST\t\t: %s\n"
+	   "Data mode\t: %s\n"
+	   "Battery\t\t: %s\n"
+	   "Aux batt\t: %s\n"
+	   "Update IRQ\t: %s\n"
+	   "Periodic IRQ\t: %s\n"
+	   "Periodic Rate\t: %s\n"
+	   "SQW Freq\t: %s\n"
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+	   "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n"
+	   "Register Status\t:\n"
+	   "   Ctrl A\t: UIP  DV2  DV1  DV0  RS3  RS2  RS1  RS0\n"
+	   "\t\t:  %s\n"
+	   "   Ctrl B\t: SET  PIE  AIE  UIE  SQWE  DM  2412 DSE\n"
+	   "\t\t:  %s\n"
+	   "   Ctrl C\t: IRQF  PF   AF   UF  ---  ---  ---  ---\n"
+	   "\t\t:  %s\n"
+	   "   Ctrl D\t: VRT  ---  ---  ---  ---  ---  ---  ---\n"
+	   "\t\t:  %s\n"
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+	   "   Ctrl 4A\t: VRT2 INCR BME  ---  PAB   RF   WF   KF\n"
+#else
+	   "   Ctrl 4A\t: VRT2 INCR ---  ---  PAB   RF   WF   KF\n"
+#endif
+	   "\t\t:  %s\n"
+	   "   Ctrl 4B\t: ABE  E32k  CS  RCE  PRS  RIE  WIE  KSE\n"
+	   "\t\t:  %s\n",
+#else
+	   "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+#endif
+	   model,
+	   ((ctrla & RTC_CTRL_A_DV1) ? "enabled" : "disabled"),
+	   ((ctrlb & RTC_CTRL_B_2412) ? "24-hour" : "12-hour"),
+	   ((ctrlb & RTC_CTRL_B_DSE) ? "enabled" : "disabled"),
+	   ((ctrlb & RTC_CTRL_B_DM) ? "binary" : "BCD"),
+	   ((ctrld & RTC_CTRL_D_VRT) ? "ok" : "exhausted or n/a"),
+	   ((ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "exhausted or n/a"),
+	   ((ctrlb & RTC_CTRL_B_UIE) ? "yes" : "no"),
+	   ((ctrlb & RTC_CTRL_B_PIE) ? "yes" : "no"),
+	   (!(ctrl4b & RTC_CTRL_4B_E32K) ?
+	    ds1685_rtc_pirq_rate[(ctrla & RTC_CTRL_A_RS_MASK)] : "none"),
+	   (!((ctrl4b & RTC_CTRL_4B_E32K)) ?
+	    ds1685_rtc_sqw_freq[(ctrla & RTC_CTRL_A_RS_MASK)] : "32768Hz"),
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+	   ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7],
+	   ds1685_rtc_print_regs(ctrla, bits[0]),
+	   ds1685_rtc_print_regs(ctrlb, bits[1]),
+	   ds1685_rtc_print_regs(ctrlc, bits[2]),
+	   ds1685_rtc_print_regs(ctrld, bits[3]),
+	   ds1685_rtc_print_regs(ctrl4a, bits[4]),
+	   ds1685_rtc_print_regs(ctrl4b, bits[5]));
+#else
+	   ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7]);
+#endif
+	return 0;
+}
+#else
+#define ds1685_rtc_proc NULL
+#endif /* CONFIG_PROC_FS */
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* RTC Class operations */
+
+static const struct rtc_class_ops
+ds1685_rtc_ops = {
+	.proc = ds1685_rtc_proc,
+	.read_time = ds1685_rtc_read_time,
+	.set_time = ds1685_rtc_set_time,
+	.read_alarm = ds1685_rtc_read_alarm,
+	.set_alarm = ds1685_rtc_set_alarm,
+	.alarm_irq_enable = ds1685_rtc_alarm_irq_enable,
+};
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* SysFS interface */
+
+#ifdef CONFIG_SYSFS
+/**
+ * ds1685_rtc_sysfs_nvram_read - reads rtc nvram via sysfs.
+ * @file: pointer to file structure.
+ * @kobj: pointer to kobject structure.
+ * @bin_attr: pointer to bin_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ * @pos: current file position pointer.
+ * @size: size of the data to read.
+ */
+static ssize_t
+ds1685_rtc_sysfs_nvram_read(struct file *filp, struct kobject *kobj,
+			    struct bin_attribute *bin_attr, char *buf,
+			    loff_t pos, size_t size)
+{
+	struct platform_device *pdev =
+		to_platform_device(container_of(kobj, struct device, kobj));
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	ssize_t count;
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&rtc->lock, flags);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Read NVRAM in time and bank0 registers. */
+	for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0;
+	     count++, size--) {
+		if (count < NVRAM_SZ_TIME)
+			*buf++ = rtc->read(rtc, (NVRAM_TIME_BASE + pos++));
+		else
+			*buf++ = rtc->read(rtc, (NVRAM_BANK0_BASE + pos++));
+	}
+
+#ifndef CONFIG_RTC_DRV_DS1689
+	if (size > 0) {
+		ds1685_rtc_switch_to_bank1(rtc);
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Enable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) |
+			    RTC_CTRL_4A_BME));
+
+		/* We need one write to RTC_BANK1_RAM_ADDR_LSB to start
+		 * reading with burst-mode */
+		rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB,
+			   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+
+		/* Read NVRAM in bank1 registers. */
+		for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ;
+		     count++, size--) {
+#ifdef CONFIG_RTC_DRV_DS1685
+			/* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR
+			 * before each read. */
+			rtc->write(rtc, RTC_BANK1_RAM_ADDR,
+				   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+			*buf++ = rtc->read(rtc, RTC_BANK1_RAM_DATA_PORT);
+			pos++;
+		}
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Disable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+			    ~(RTC_CTRL_4A_BME)));
+#endif
+		ds1685_rtc_switch_to_bank0(rtc);
+	}
+#endif /* !CONFIG_RTC_DRV_DS1689 */
+	spin_unlock_irqrestore(&rtc->lock, flags);
+
+	/*
+	 * XXX: Bug? this appears to cause the function to get executed
+	 * several times in succession.  But it's the only way to actually get
+	 * data written out to a file.
+	 */
+	return count;
+}
+
+/**
+ * ds1685_rtc_sysfs_nvram_write - writes rtc nvram via sysfs.
+ * @file: pointer to file structure.
+ * @kobj: pointer to kobject structure.
+ * @bin_attr: pointer to bin_attribute structure.
+ * @buf: pointer to char array to hold the input.
+ * @pos: current file position pointer.
+ * @size: size of the data to write.
+ */
+static ssize_t
+ds1685_rtc_sysfs_nvram_write(struct file *filp, struct kobject *kobj,
+			     struct bin_attribute *bin_attr, char *buf,
+			     loff_t pos, size_t size)
+{
+	struct platform_device *pdev =
+		to_platform_device(container_of(kobj, struct device, kobj));
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	ssize_t count;
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&rtc->lock, flags);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Write NVRAM in time and bank0 registers. */
+	for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0;
+	     count++, size--)
+		if (count < NVRAM_SZ_TIME)
+			rtc->write(rtc, (NVRAM_TIME_BASE + pos++),
+				   *buf++);
+		else
+			rtc->write(rtc, (NVRAM_BANK0_BASE), *buf++);
+
+#ifndef CONFIG_RTC_DRV_DS1689
+	if (size > 0) {
+		ds1685_rtc_switch_to_bank1(rtc);
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Enable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) |
+			    RTC_CTRL_4A_BME));
+
+		/* We need one write to RTC_BANK1_RAM_ADDR_LSB to start
+		 * writing with burst-mode */
+		rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB,
+			   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+
+		/* Write NVRAM in bank1 registers. */
+		for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ;
+		     count++, size--) {
+#ifdef CONFIG_RTC_DRV_DS1685
+			/* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR
+			 * before each read. */
+			rtc->write(rtc, RTC_BANK1_RAM_ADDR,
+				   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+			rtc->write(rtc, RTC_BANK1_RAM_DATA_PORT, *buf++);
+			pos++;
+		}
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Disable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+			    ~(RTC_CTRL_4A_BME)));
+#endif
+		ds1685_rtc_switch_to_bank0(rtc);
+	}
+#endif /* !CONFIG_RTC_DRV_DS1689 */
+	spin_unlock_irqrestore(&rtc->lock, flags);
+
+	return count;
+}
+
+/**
+ * struct ds1685_rtc_sysfs_nvram_attr - sysfs attributes for rtc nvram.
+ * @attr: nvram attributes.
+ * @read: nvram read function.
+ * @write: nvram write function.
+ * @size: nvram total size (bank0 + extended).
+ */
+static struct bin_attribute
+ds1685_rtc_sysfs_nvram_attr = {
+	.attr = {
+		.name = "nvram",
+		.mode = S_IRUGO | S_IWUSR,
+	},
+	.read = ds1685_rtc_sysfs_nvram_read,
+	.write = ds1685_rtc_sysfs_nvram_write,
+	.size = NVRAM_TOTAL_SZ
+};
+
+/**
+ * ds1685_rtc_sysfs_battery_show - sysfs file for main battery status.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_battery_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrld;
+
+	ctrld = rtc->read(rtc, RTC_CTRL_D);
+
+	return snprintf(buf, 13, "%s\n",
+			(ctrld & RTC_CTRL_D_VRT) ? "ok" : "not ok or N/A");
+}
+static DEVICE_ATTR(battery, S_IRUGO, ds1685_rtc_sysfs_battery_show, NULL);
+
+/**
+ * ds1685_rtc_sysfs_auxbatt_show - sysfs file for aux battery status.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_auxbatt_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrl4a;
+
+	ds1685_rtc_switch_to_bank1(rtc);
+	ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	return snprintf(buf, 13, "%s\n",
+			(ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "not ok or N/A");
+}
+static DEVICE_ATTR(auxbatt, S_IRUGO, ds1685_rtc_sysfs_auxbatt_show, NULL);
+
+/**
+ * ds1685_rtc_sysfs_serial_show - sysfs file for silicon serial number.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_serial_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ssn[8];
+
+	ds1685_rtc_switch_to_bank1(rtc);
+	ds1685_rtc_get_ssn(rtc, ssn);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	return snprintf(buf, 24, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5],
+			ssn[6], ssn[7]);
+
+	return 0;
+}
+static DEVICE_ATTR(serial, S_IRUGO, ds1685_rtc_sysfs_serial_show, NULL);
+
+/**
+ * struct ds1685_rtc_sysfs_misc_attrs - list for misc RTC features.
+ */
+static struct attribute*
+ds1685_rtc_sysfs_misc_attrs[] = {
+	&dev_attr_battery.attr,
+	&dev_attr_auxbatt.attr,
+	&dev_attr_serial.attr,
+	NULL,
+};
+
+/**
+ * struct ds1685_rtc_sysfs_misc_grp - attr group for misc RTC features.
+ */
+static const struct attribute_group
+ds1685_rtc_sysfs_misc_grp = {
+	.name = "misc",
+	.attrs = ds1685_rtc_sysfs_misc_attrs,
+};
+
+#ifdef CONFIG_RTC_DS1685_SYSFS_REGS
+/**
+ * struct ds1685_rtc_ctrl_regs.
+ * @name: char pointer for the bit name.
+ * @reg: control register the bit is in.
+ * @bit: the bit's offset in the register.
+ */
+struct ds1685_rtc_ctrl_regs {
+	const char *name;
+	const u8 reg;
+	const u8 bit;
+};
+
+/*
+ * Ctrl register bit lookup table.
+ */
+static const struct ds1685_rtc_ctrl_regs
+ds1685_ctrl_regs_table[] = {
+	{ "uip",  RTC_CTRL_A,      RTC_CTRL_A_UIP   },
+	{ "dv2",  RTC_CTRL_A,      RTC_CTRL_A_DV2   },
+	{ "dv1",  RTC_CTRL_A,      RTC_CTRL_A_DV1   },
+	{ "dv0",  RTC_CTRL_A,      RTC_CTRL_A_DV0   },
+	{ "rs3",  RTC_CTRL_A,      RTC_CTRL_A_RS3   },
+	{ "rs2",  RTC_CTRL_A,      RTC_CTRL_A_RS2   },
+	{ "rs1",  RTC_CTRL_A,      RTC_CTRL_A_RS1   },
+	{ "rs0",  RTC_CTRL_A,      RTC_CTRL_A_RS0   },
+	{ "set",  RTC_CTRL_B,      RTC_CTRL_B_SET   },
+	{ "pie",  RTC_CTRL_B,      RTC_CTRL_B_PIE   },
+	{ "aie",  RTC_CTRL_B,      RTC_CTRL_B_AIE   },
+	{ "uie",  RTC_CTRL_B,      RTC_CTRL_B_UIE   },
+	{ "sqwe", RTC_CTRL_B,      RTC_CTRL_B_SQWE  },
+	{ "dm",   RTC_CTRL_B,      RTC_CTRL_B_DM    },
+	{ "2412", RTC_CTRL_B,      RTC_CTRL_B_2412  },
+	{ "dse",  RTC_CTRL_B,      RTC_CTRL_B_DSE   },
+	{ "irqf", RTC_CTRL_C,      RTC_CTRL_C_IRQF  },
+	{ "pf",   RTC_CTRL_C,      RTC_CTRL_C_PF    },
+	{ "af",   RTC_CTRL_C,      RTC_CTRL_C_AF    },
+	{ "uf",   RTC_CTRL_C,      RTC_CTRL_C_UF    },
+	{ "vrt",  RTC_CTRL_D,      RTC_CTRL_D_VRT   },
+	{ "vrt2", RTC_EXT_CTRL_4A, RTC_CTRL_4A_VRT2 },
+	{ "incr", RTC_EXT_CTRL_4A, RTC_CTRL_4A_INCR },
+	{ "pab",  RTC_EXT_CTRL_4A, RTC_CTRL_4A_PAB  },
+	{ "rf",   RTC_EXT_CTRL_4A, RTC_CTRL_4A_RF   },
+	{ "wf",   RTC_EXT_CTRL_4A, RTC_CTRL_4A_WF   },
+	{ "kf",   RTC_EXT_CTRL_4A, RTC_CTRL_4A_KF   },
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+	{ "bme",  RTC_EXT_CTRL_4A, RTC_CTRL_4A_BME  },
+#endif
+	{ "abe",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_ABE  },
+	{ "e32k", RTC_EXT_CTRL_4B, RTC_CTRL_4B_E32K },
+	{ "cs",   RTC_EXT_CTRL_4B, RTC_CTRL_4B_CS   },
+	{ "rce",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_RCE  },
+	{ "prs",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_PRS  },
+	{ "rie",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_RIE  },
+	{ "wie",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_WIE  },
+	{ "kse",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_KSE  },
+	{ NULL,   0,               0                },
+};
+
+/**
+ * ds1685_rtc_sysfs_ctrl_regs_lookup - ctrl register bit lookup function.
+ * @name: ctrl register bit to look up in ds1685_ctrl_regs_table.
+ */
+static const struct ds1685_rtc_ctrl_regs*
+ds1685_rtc_sysfs_ctrl_regs_lookup(const char *name)
+{
+	const struct ds1685_rtc_ctrl_regs *p = ds1685_ctrl_regs_table;
+
+	for (; p->name != NULL; ++p)
+		if (strcmp(p->name, name) == 0)
+			return p;
+
+	return NULL;
+}
+
+/**
+ * ds1685_rtc_sysfs_ctrl_regs_show - reads a ctrl register bit via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_ctrl_regs_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u8 tmp;
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	const struct ds1685_rtc_ctrl_regs *reg_info =
+		ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name);
+
+	/* Make sure we actually matched something. */
+	if (!reg_info)
+		return -EINVAL;
+
+	/* No spinlock during a read -- mutex is already held. */
+	ds1685_rtc_switch_to_bank1(rtc);
+	tmp = rtc->read(rtc, reg_info->reg) & reg_info->bit;
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	return snprintf(buf, 2, "%d\n", (tmp ? 1 : 0));
+}
+
+/**
+ * ds1685_rtc_sysfs_ctrl_regs_store - writes a ctrl register bit via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ * @count: number of bytes written.
+ */
+static ssize_t
+ds1685_rtc_sysfs_ctrl_regs_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	u8 reg = 0, bit = 0, tmp;
+	unsigned long flags = 0;
+	long int val = 0;
+	const struct ds1685_rtc_ctrl_regs *reg_info =
+		ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name);
+
+	/* We only accept numbers. */
+	if (kstrtol(buf, 10, &val) < 0)
+		return -EINVAL;
+
+	/* bits are binary, 0 or 1 only. */
+	if ((val != 0) && (val != 1))
+		return -ERANGE;
+
+	/* Make sure we actually matched something. */
+	if (!reg_info)
+		return -EINVAL;
+
+	reg = reg_info->reg;
+	bit = reg_info->bit;
+
+	/* Safe to spinlock during a write. */
+	ds1685_rtc_begin_ctrl_access(rtc, flags);
+	tmp = rtc->read(rtc, reg);
+	rtc->write(rtc, reg, (val ? (tmp | bit) : (tmp & ~(bit))));
+	ds1685_rtc_end_ctrl_access(rtc, flags);
+
+	return count;
+}
+
+/**
+ * DS1685_RTC_SYSFS_CTRL_REG_RO - device_attribute for read-only register bit.
+ * @bit: bit to read.
+ */
+#define DS1685_RTC_SYSFS_CTRL_REG_RO(bit)				\
+	static DEVICE_ATTR(bit, S_IRUGO,				\
+	ds1685_rtc_sysfs_ctrl_regs_show, NULL)
+
+/**
+ * DS1685_RTC_SYSFS_CTRL_REG_RW - device_attribute for read-write register bit.
+ * @bit: bit to read or write.
+ */
+#define DS1685_RTC_SYSFS_CTRL_REG_RW(bit)				\
+	static DEVICE_ATTR(bit, S_IRUGO | S_IWUSR,			\
+	ds1685_rtc_sysfs_ctrl_regs_show,				\
+	ds1685_rtc_sysfs_ctrl_regs_store)
+
+/*
+ * Control Register A bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(uip);
+DS1685_RTC_SYSFS_CTRL_REG_RW(dv2);
+DS1685_RTC_SYSFS_CTRL_REG_RW(dv1);
+DS1685_RTC_SYSFS_CTRL_REG_RO(dv0);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs3);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs2);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs1);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs0);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrla_attrs[] = {
+	&dev_attr_uip.attr,
+	&dev_attr_dv2.attr,
+	&dev_attr_dv1.attr,
+	&dev_attr_dv0.attr,
+	&dev_attr_rs3.attr,
+	&dev_attr_rs2.attr,
+	&dev_attr_rs1.attr,
+	&dev_attr_rs0.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrla_grp = {
+	.name = "ctrla",
+	.attrs = ds1685_rtc_sysfs_ctrla_attrs,
+};
+
+
+/*
+ * Control Register B bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(set);
+DS1685_RTC_SYSFS_CTRL_REG_RW(pie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(aie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(uie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(sqwe);
+DS1685_RTC_SYSFS_CTRL_REG_RO(dm);
+DS1685_RTC_SYSFS_CTRL_REG_RO(2412);
+DS1685_RTC_SYSFS_CTRL_REG_RO(dse);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrlb_attrs[] = {
+	&dev_attr_set.attr,
+	&dev_attr_pie.attr,
+	&dev_attr_aie.attr,
+	&dev_attr_uie.attr,
+	&dev_attr_sqwe.attr,
+	&dev_attr_dm.attr,
+	&dev_attr_2412.attr,
+	&dev_attr_dse.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrlb_grp = {
+	.name = "ctrlb",
+	.attrs = ds1685_rtc_sysfs_ctrlb_attrs,
+};
+
+/*
+ * Control Register C bits.
+ *
+ * Reading Control C clears these bits!  Reading them individually can
+ * possibly cause an interrupt to be missed.  Use the /proc interface
+ * to see all the bits in this register simultaneously.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(irqf);
+DS1685_RTC_SYSFS_CTRL_REG_RO(pf);
+DS1685_RTC_SYSFS_CTRL_REG_RO(af);
+DS1685_RTC_SYSFS_CTRL_REG_RO(uf);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrlc_attrs[] = {
+	&dev_attr_irqf.attr,
+	&dev_attr_pf.attr,
+	&dev_attr_af.attr,
+	&dev_attr_uf.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrlc_grp = {
+	.name = "ctrlc",
+	.attrs = ds1685_rtc_sysfs_ctrlc_attrs,
+};
+
+/*
+ * Control Register D bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(vrt);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrld_attrs[] = {
+	&dev_attr_vrt.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrld_grp = {
+	.name = "ctrld",
+	.attrs = ds1685_rtc_sysfs_ctrld_attrs,
+};
+
+/*
+ * Control Register 4A bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(vrt2);
+DS1685_RTC_SYSFS_CTRL_REG_RO(incr);
+DS1685_RTC_SYSFS_CTRL_REG_RW(pab);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rf);
+DS1685_RTC_SYSFS_CTRL_REG_RW(wf);
+DS1685_RTC_SYSFS_CTRL_REG_RW(kf);
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+DS1685_RTC_SYSFS_CTRL_REG_RO(bme);
+#endif
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrl4a_attrs[] = {
+	&dev_attr_vrt2.attr,
+	&dev_attr_incr.attr,
+	&dev_attr_pab.attr,
+	&dev_attr_rf.attr,
+	&dev_attr_wf.attr,
+	&dev_attr_kf.attr,
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+	&dev_attr_bme.attr,
+#endif
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrl4a_grp = {
+	.name = "ctrl4a",
+	.attrs = ds1685_rtc_sysfs_ctrl4a_attrs,
+};
+
+/*
+ * Control Register 4B bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RW(abe);
+DS1685_RTC_SYSFS_CTRL_REG_RW(e32k);
+DS1685_RTC_SYSFS_CTRL_REG_RO(cs);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rce);
+DS1685_RTC_SYSFS_CTRL_REG_RW(prs);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(wie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(kse);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrl4b_attrs[] = {
+	&dev_attr_abe.attr,
+	&dev_attr_e32k.attr,
+	&dev_attr_cs.attr,
+	&dev_attr_rce.attr,
+	&dev_attr_prs.attr,
+	&dev_attr_rie.attr,
+	&dev_attr_wie.attr,
+	&dev_attr_kse.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrl4b_grp = {
+	.name = "ctrl4b",
+	.attrs = ds1685_rtc_sysfs_ctrl4b_attrs,
+};
+
+
+/**
+ * struct ds1685_rtc_ctrl_regs.
+ * @name: char pointer for the bit name.
+ * @reg: control register the bit is in.
+ * @bit: the bit's offset in the register.
+ */
+struct ds1685_rtc_time_regs {
+	const char *name;
+	const u8 reg;
+	const u8 mask;
+	const u8 min;
+	const u8 max;
+};
+
+/*
+ * Time/Date register lookup tables.
+ */
+static const struct ds1685_rtc_time_regs
+ds1685_time_regs_bcd_table[] = {
+	{ "seconds",       RTC_SECS,       RTC_SECS_BCD_MASK,   0, 59 },
+	{ "minutes",       RTC_MINS,       RTC_MINS_BCD_MASK,   0, 59 },
+	{ "hours",         RTC_HRS,        RTC_HRS_24_BCD_MASK, 0, 23 },
+	{ "wday",          RTC_WDAY,       RTC_WDAY_MASK,       1,  7 },
+	{ "mday",          RTC_MDAY,       RTC_MDAY_BCD_MASK,   1, 31 },
+	{ "month",         RTC_MONTH,      RTC_MONTH_BCD_MASK,  1, 12 },
+	{ "year",          RTC_YEAR,       RTC_YEAR_BCD_MASK,   0, 99 },
+	{ "century",       RTC_CENTURY,    RTC_CENTURY_MASK,    0, 99 },
+	{ "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BCD_MASK,   0, 59 },
+	{ "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BCD_MASK,   0, 59 },
+	{ "alarm_hours",   RTC_HRS_ALARM,  RTC_HRS_24_BCD_MASK, 0, 23 },
+	{ "alarm_mday",    RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 1, 31 },
+	{ NULL,            0,              0,                   0,  0 },
+};
+
+static const struct ds1685_rtc_time_regs
+ds1685_time_regs_bin_table[] = {
+	{ "seconds",       RTC_SECS,       RTC_SECS_BIN_MASK,   0x00, 0x3b },
+	{ "minutes",       RTC_MINS,       RTC_MINS_BIN_MASK,   0x00, 0x3b },
+	{ "hours",         RTC_HRS,        RTC_HRS_24_BIN_MASK, 0x00, 0x17 },
+	{ "wday",          RTC_WDAY,       RTC_WDAY_MASK,       0x01, 0x07 },
+	{ "mday",          RTC_MDAY,       RTC_MDAY_BIN_MASK,   0x01, 0x1f },
+	{ "month",         RTC_MONTH,      RTC_MONTH_BIN_MASK,  0x01, 0x0c },
+	{ "year",          RTC_YEAR,       RTC_YEAR_BIN_MASK,   0x00, 0x63 },
+	{ "century",       RTC_CENTURY,    RTC_CENTURY_MASK,    0x00, 0x63 },
+	{ "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BIN_MASK,   0x00, 0x3b },
+	{ "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BIN_MASK,   0x00, 0x3b },
+	{ "alarm_hours",   RTC_HRS_ALARM,  RTC_HRS_24_BIN_MASK, 0x00, 0x17 },
+	{ "alarm_mday",    RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 0x01, 0x1f },
+	{ NULL,            0,              0,                   0x00, 0x00 },
+};
+
+/**
+ * ds1685_rtc_sysfs_time_regs_bcd_lookup - time/date reg bit lookup function.
+ * @name: register bit to look up in ds1685_time_regs_bcd_table.
+ */
+static const struct ds1685_rtc_time_regs*
+ds1685_rtc_sysfs_time_regs_lookup(const char *name, bool bcd_mode)
+{
+	const struct ds1685_rtc_time_regs *p;
+
+	if (bcd_mode)
+		p = ds1685_time_regs_bcd_table;
+	else
+		p = ds1685_time_regs_bin_table;
+
+	for (; p->name != NULL; ++p)
+		if (strcmp(p->name, name) == 0)
+			return p;
+
+	return NULL;
+}
+
+/**
+ * ds1685_rtc_sysfs_time_regs_show - reads a time/date register via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_time_regs_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u8 tmp;
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	const struct ds1685_rtc_time_regs *bcd_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true);
+	const struct ds1685_rtc_time_regs *bin_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false);
+
+	/* Make sure we actually matched something. */
+	if (!bcd_reg_info && !bin_reg_info)
+		return -EINVAL;
+
+	/* bcd_reg_info->reg == bin_reg_info->reg. */
+	ds1685_rtc_begin_data_access(rtc);
+	tmp = rtc->read(rtc, bcd_reg_info->reg);
+	ds1685_rtc_end_data_access(rtc);
+
+	tmp = ds1685_rtc_bcd2bin(rtc, tmp, bcd_reg_info->mask,
+				 bin_reg_info->mask);
+
+	return snprintf(buf, 4, "%d\n", tmp);
+}
+
+/**
+ * ds1685_rtc_sysfs_time_regs_store - writes a time/date register via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ * @count: number of bytes written.
+ */
+static ssize_t
+ds1685_rtc_sysfs_time_regs_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	long int val = 0;
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	const struct ds1685_rtc_time_regs *bcd_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true);
+	const struct ds1685_rtc_time_regs *bin_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false);
+
+	/* We only accept numbers. */
+	if (kstrtol(buf, 10, &val) < 0)
+		return -EINVAL;
+
+	/* Make sure we actually matched something. */
+	if (!bcd_reg_info && !bin_reg_info)
+		return -EINVAL;
+
+	/* Check for a valid range. */
+	if (rtc->bcd_mode) {
+		if ((val < bcd_reg_info->min) || (val > bcd_reg_info->max))
+			return -ERANGE;
+	} else {
+		if ((val < bin_reg_info->min) || (val > bin_reg_info->max))
+			return -ERANGE;
+	}
+
+	val = ds1685_rtc_bin2bcd(rtc, val, bin_reg_info->mask,
+				 bcd_reg_info->mask);
+
+	/* bcd_reg_info->reg == bin_reg_info->reg. */
+	ds1685_rtc_begin_data_access(rtc);
+	rtc->write(rtc, bcd_reg_info->reg, val);
+	ds1685_rtc_end_data_access(rtc);
+
+	return count;
+}
+
+/**
+ * DS1685_RTC_SYSFS_REG_RW - device_attribute for a read-write time register.
+ * @reg: time/date register to read or write.
+ */
+#define DS1685_RTC_SYSFS_TIME_REG_RW(reg)				\
+	static DEVICE_ATTR(reg, S_IRUGO | S_IWUSR,			\
+	ds1685_rtc_sysfs_time_regs_show,				\
+	ds1685_rtc_sysfs_time_regs_store)
+
+/*
+ * Time/Date Register bits.
+ */
+DS1685_RTC_SYSFS_TIME_REG_RW(seconds);
+DS1685_RTC_SYSFS_TIME_REG_RW(minutes);
+DS1685_RTC_SYSFS_TIME_REG_RW(hours);
+DS1685_RTC_SYSFS_TIME_REG_RW(wday);
+DS1685_RTC_SYSFS_TIME_REG_RW(mday);
+DS1685_RTC_SYSFS_TIME_REG_RW(month);
+DS1685_RTC_SYSFS_TIME_REG_RW(year);
+DS1685_RTC_SYSFS_TIME_REG_RW(century);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_seconds);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_minutes);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_hours);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_mday);
+
+static struct attribute*
+ds1685_rtc_sysfs_time_attrs[] = {
+	&dev_attr_seconds.attr,
+	&dev_attr_minutes.attr,
+	&dev_attr_hours.attr,
+	&dev_attr_wday.attr,
+	&dev_attr_mday.attr,
+	&dev_attr_month.attr,
+	&dev_attr_year.attr,
+	&dev_attr_century.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_time_grp = {
+	.name = "datetime",
+	.attrs = ds1685_rtc_sysfs_time_attrs,
+};
+
+static struct attribute*
+ds1685_rtc_sysfs_alarm_attrs[] = {
+	&dev_attr_alarm_seconds.attr,
+	&dev_attr_alarm_minutes.attr,
+	&dev_attr_alarm_hours.attr,
+	&dev_attr_alarm_mday.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_alarm_grp = {
+	.name = "alarm",
+	.attrs = ds1685_rtc_sysfs_alarm_attrs,
+};
+#endif /* CONFIG_RTC_DS1685_SYSFS_REGS */
+
+
+/**
+ * ds1685_rtc_sysfs_register - register sysfs files.
+ * @dev: pointer to device structure.
+ */
+static int
+ds1685_rtc_sysfs_register(struct device *dev)
+{
+	int ret = 0;
+
+	sysfs_bin_attr_init(&ds1685_rtc_sysfs_nvram_attr);
+	ret = sysfs_create_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_misc_grp);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_RTC_DS1685_SYSFS_REGS
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrla_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlb_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlc_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrld_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4a_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4b_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_time_grp);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_alarm_grp);
+	if (ret)
+		return ret;
+#endif
+	return 0;
+}
+
+/**
+ * ds1685_rtc_sysfs_unregister - unregister sysfs files.
+ * @dev: pointer to device structure.
+ */
+static int
+ds1685_rtc_sysfs_unregister(struct device *dev)
+{
+	sysfs_remove_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_misc_grp);
+
+#ifdef CONFIG_RTC_DS1685_SYSFS_REGS
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrla_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlb_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlc_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrld_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4a_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4b_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_time_grp);
+	sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_alarm_grp);
+#endif
+
+	return 0;
+}
+#endif /* CONFIG_SYSFS */
+
+
+
+/* ----------------------------------------------------------------------- */
+/* Driver Probe/Removal */
+
+/**
+ * ds1685_rtc_probe - initializes rtc driver.
+ * @pdev: pointer to platform_device structure.
+ */
+static int
+ds1685_rtc_probe(struct platform_device *pdev)
+{
+	struct rtc_device *rtc_dev;
+	struct resource *res;
+	struct ds1685_priv *rtc;
+	struct ds1685_rtc_platform_data *pdata;
+	u8 ctrla, ctrlb, hours;
+	unsigned char am_pm;
+	int ret = 0;
+
+	/* Get the platform data. */
+	pdata = (struct ds1685_rtc_platform_data *) pdev->dev.platform_data;
+	if (!pdata)
+		return -ENODEV;
+
+	/* Allocate memory for the rtc device. */
+	rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+	if (!rtc)
+		return -ENOMEM;
+
+	/*
+	 * Allocate/setup any IORESOURCE_MEM resources, if required.  Not all
+	 * platforms put the RTC in an easy-access place.  Like the SGI Octane,
+	 * which attaches the RTC to a "ByteBus", hooked to a SuperIO chip
+	 * that sits behind the IOC3 PCI metadevice.
+	 */
+	if (pdata->alloc_io_resources) {
+		/* Get the platform resources. */
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+		if (!res)
+			return -ENXIO;
+		rtc->size = resource_size(res);
+
+		/* Request a memory region. */
+		/* XXX: mmio-only for now. */
+		if (!devm_request_mem_region(&pdev->dev, res->start, rtc->size,
+					     pdev->name))
+			return -EBUSY;
+
+		/*
+		 * Set the base address for the rtc, and ioremap its
+		 * registers.
+		 */
+		rtc->baseaddr = res->start;
+		rtc->regs = devm_ioremap(&pdev->dev, res->start, rtc->size);
+		if (!rtc->regs)
+			return -ENOMEM;
+	}
+	rtc->alloc_io_resources = pdata->alloc_io_resources;
+
+	/* Get the register step size. */
+	if (pdata->regstep > 0)
+		rtc->regstep = pdata->regstep;
+	else
+		rtc->regstep = 1;
+
+	/* Platform read function, else default if mmio setup */
+	if (pdata->plat_read)
+		rtc->read = pdata->plat_read;
+	else
+		if (pdata->alloc_io_resources)
+			rtc->read = ds1685_read;
+		else
+			return -ENXIO;
+
+	/* Platform write function, else default if mmio setup */
+	if (pdata->plat_write)
+		rtc->write = pdata->plat_write;
+	else
+		if (pdata->alloc_io_resources)
+			rtc->write = ds1685_write;
+		else
+			return -ENXIO;
+
+	/* Platform pre-shutdown function, if defined. */
+	if (pdata->plat_prepare_poweroff)
+		rtc->prepare_poweroff = pdata->plat_prepare_poweroff;
+
+	/* Platform wake_alarm function, if defined. */
+	if (pdata->plat_wake_alarm)
+		rtc->wake_alarm = pdata->plat_wake_alarm;
+
+	/* Platform post_ram_clear function, if defined. */
+	if (pdata->plat_post_ram_clear)
+		rtc->post_ram_clear = pdata->plat_post_ram_clear;
+
+	/* Init the spinlock, workqueue, & set the driver data. */
+	spin_lock_init(&rtc->lock);
+	INIT_WORK(&rtc->work, ds1685_rtc_work_queue);
+	platform_set_drvdata(pdev, rtc);
+
+	/* Turn the oscillator on if is not already on (DV1 = 1). */
+	ctrla = rtc->read(rtc, RTC_CTRL_A);
+	if (!(ctrla & RTC_CTRL_A_DV1))
+		ctrla |= RTC_CTRL_A_DV1;
+
+	/* Enable the countdown chain (DV2 = 0) */
+	ctrla &= ~(RTC_CTRL_A_DV2);
+
+	/* Clear RS3-RS0 in Control A. */
+	ctrla &= ~(RTC_CTRL_A_RS_MASK);
+
+	/*
+	 * All done with Control A.  Switch to Bank 1 for the remainder of
+	 * the RTC setup so we have access to the extended functions.
+	 */
+	ctrla |= RTC_CTRL_A_DV0;
+	rtc->write(rtc, RTC_CTRL_A, ctrla);
+
+	/* Default to 32768kHz output. */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_E32K));
+
+	/* Set the SET bit in Control B so we can do some housekeeping. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET));
+
+	/* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */
+	while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR)
+		cpu_relax();
+
+	/*
+	 * If the platform supports BCD mode, then set DM=0 in Control B.
+	 * Otherwise, set DM=1 for BIN mode.
+	 */
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	if (pdata->bcd_mode)
+		ctrlb &= ~(RTC_CTRL_B_DM);
+	else
+		ctrlb |= RTC_CTRL_B_DM;
+	rtc->bcd_mode = pdata->bcd_mode;
+
+	/*
+	 * Disable Daylight Savings Time (DSE = 0).
+	 * The RTC has hardcoded timezone information that is rendered
+	 * obselete.  We'll let the OS deal with DST settings instead.
+	 */
+	if (ctrlb & RTC_CTRL_B_DSE)
+		ctrlb &= ~(RTC_CTRL_B_DSE);
+
+	/* Force 24-hour mode (2412 = 1). */
+	if (!(ctrlb & RTC_CTRL_B_2412)) {
+		/* Reinitialize the time hours. */
+		hours = rtc->read(rtc, RTC_HRS);
+		am_pm = hours & RTC_HRS_AMPM_MASK;
+		hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK,
+					   RTC_HRS_12_BIN_MASK);
+		hours = ((hours == 12) ? 0 : ((am_pm) ? hours + 12 : hours));
+
+		/* Enable 24-hour mode. */
+		ctrlb |= RTC_CTRL_B_2412;
+
+		/* Write back to Control B, including DM & DSE bits. */
+		rtc->write(rtc, RTC_CTRL_B, ctrlb);
+
+		/* Write the time hours back. */
+		rtc->write(rtc, RTC_HRS,
+			   ds1685_rtc_bin2bcd(rtc, hours,
+					      RTC_HRS_24_BIN_MASK,
+					      RTC_HRS_24_BCD_MASK));
+
+		/* Reinitialize the alarm hours. */
+		hours = rtc->read(rtc, RTC_HRS_ALARM);
+		am_pm = hours & RTC_HRS_AMPM_MASK;
+		hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK,
+					   RTC_HRS_12_BIN_MASK);
+		hours = ((hours == 12) ? 0 : ((am_pm) ? hours + 12 : hours));
+
+		/* Write the alarm hours back. */
+		rtc->write(rtc, RTC_HRS_ALARM,
+			   ds1685_rtc_bin2bcd(rtc, hours,
+					      RTC_HRS_24_BIN_MASK,
+					      RTC_HRS_24_BCD_MASK));
+	} else {
+		/* 24-hour mode is already set, so write Control B back. */
+		rtc->write(rtc, RTC_CTRL_B, ctrlb);
+	}
+
+	/* Unset the SET bit in Control B so the RTC can update. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET)));
+
+	/* Check the main battery. */
+	if (!(rtc->read(rtc, RTC_CTRL_D) & RTC_CTRL_D_VRT))
+		dev_warn(&pdev->dev,
+			 "Main battery is exhausted! RTC may be invalid!\n");
+
+	/* Check the auxillary battery.  It is optional. */
+	if (!(rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_VRT2))
+		dev_warn(&pdev->dev,
+			 "Aux battery is exhausted or not available.\n");
+
+	/* Read Ctrl B and clear PIE/AIE/UIE. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_PAU_MASK)));
+
+	/* Reading Ctrl C auto-clears PF/AF/UF. */
+	rtc->read(rtc, RTC_CTRL_C);
+
+	/* Read Ctrl 4B and clear RIE/WIE/KSE. */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) & ~(RTC_CTRL_4B_RWK_MASK)));
+
+	/* Clear RF/WF/KF in Ctrl 4A. */
+	rtc->write(rtc, RTC_EXT_CTRL_4A,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4A) & ~(RTC_CTRL_4A_RWK_MASK)));
+
+	/*
+	 * Re-enable KSE to handle power button events.  We do not enable
+	 * WIE or RIE by default.
+	 */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_KSE));
+
+	/*
+	 * Fetch the IRQ and setup the interrupt handler.
+	 *
+	 * Not all platforms have the IRQF pin tied to something.  If not, the
+	 * RTC will still set the *IE / *F flags and raise IRQF in ctrlc, but
+	 * there won't be an automatic way of notifying the kernel about it,
+	 * unless ctrlc is explicitly polled.
+	 */
+	if (!pdata->no_irq) {
+		ret = platform_get_irq(pdev, 0);
+		if (ret > 0) {
+			rtc->irq_num = ret;
+
+			/* Request an IRQ. */
+			ret = devm_request_irq(&pdev->dev, rtc->irq_num,
+					       ds1685_rtc_irq_handler,
+					       IRQF_SHARED, pdev->name, pdev);
+
+			/* Check to see if something came back. */
+			if (unlikely(ret)) {
+				dev_warn(&pdev->dev,
+					 "RTC interrupt not available\n");
+				rtc->irq_num = 0;
+			}
+		} else
+			return ret;
+	}
+	rtc->no_irq = pdata->no_irq;
+
+	/* Setup complete. */
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Register the device as an RTC. */
+	rtc_dev = rtc_device_register(pdev->name, &pdev->dev,
+				      &ds1685_rtc_ops, THIS_MODULE);
+
+	/* Success? */
+	if (IS_ERR(rtc_dev))
+		return PTR_ERR(rtc_dev);
+
+	/* Maximum periodic rate is 8192Hz (0.122070ms). */
+	rtc_dev->max_user_freq = RTC_MAX_USER_FREQ;
+
+	/* See if the platform doesn't support UIE. */
+	if (pdata->uie_unsupported)
+		rtc_dev->uie_unsupported = 1;
+	rtc->uie_unsupported = pdata->uie_unsupported;
+
+	rtc->dev = rtc_dev;
+
+#ifdef CONFIG_SYSFS
+	ret = ds1685_rtc_sysfs_register(&pdev->dev);
+	if (ret)
+		rtc_device_unregister(rtc->dev);
+#endif
+
+	/* Done! */
+	return ret;
+}
+
+/**
+ * ds1685_rtc_remove - removes rtc driver.
+ * @pdev: pointer to platform_device structure.
+ */
+static int
+ds1685_rtc_remove(struct platform_device *pdev)
+{
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+
+#ifdef CONFIG_SYSFS
+	ds1685_rtc_sysfs_unregister(&pdev->dev);
+#endif
+
+	rtc_device_unregister(rtc->dev);
+
+	/* Read Ctrl B and clear PIE/AIE/UIE. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) &
+		    ~(RTC_CTRL_B_PAU_MASK)));
+
+	/* Reading Ctrl C auto-clears PF/AF/UF. */
+	rtc->read(rtc, RTC_CTRL_C);
+
+	/* Read Ctrl 4B and clear RIE/WIE/KSE. */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) &
+		    ~(RTC_CTRL_4B_RWK_MASK)));
+
+	/* Manually clear RF/WF/KF in Ctrl 4A. */
+	rtc->write(rtc, RTC_EXT_CTRL_4A,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+		    ~(RTC_CTRL_4A_RWK_MASK)));
+
+	cancel_work_sync(&rtc->work);
+
+	return 0;
+}
+
+/**
+ * ds1685_rtc_driver - rtc driver properties.
+ */
+static struct platform_driver ds1685_rtc_driver = {
+	.driver		= {
+		.name	= "rtc-ds1685",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= ds1685_rtc_probe,
+	.remove		= ds1685_rtc_remove,
+};
+
+/**
+ * ds1685_rtc_init - rtc module init.
+ */
+static int __init
+ds1685_rtc_init(void)
+{
+	return platform_driver_register(&ds1685_rtc_driver);
+}
+
+/**
+ * ds1685_rtc_exit - rtc module exit.
+ */
+static void __exit
+ds1685_rtc_exit(void)
+{
+	platform_driver_unregister(&ds1685_rtc_driver);
+}
+
+module_init(ds1685_rtc_init);
+module_exit(ds1685_rtc_exit);
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Poweroff function */
+
+/**
+ * ds1685_rtc_poweroff - uses the RTC chip to power the system off.
+ * @pdev: pointer to platform_device structure.
+ */
+extern void __noreturn
+ds1685_rtc_poweroff(struct platform_device *pdev)
+{
+	u8 ctrla, ctrl4a, ctrl4b;
+	struct ds1685_priv *rtc;
+
+	/* Check for valid RTC data, else, spin forever. */
+	if (unlikely(!pdev)) {
+		pr_emerg("rtc-ds1685: platform device data not available, spinning forever ...\n");
+		unreachable();
+	} else {
+		/* Get the rtc data. */
+		rtc = platform_get_drvdata(pdev);
+
+		/*
+		 * Disable our IRQ.  We're powering down, so we're not
+		 * going to worry about cleaning up.  Most of that should
+		 * have been taken care of by the shutdown scripts and this
+		 * is the final function call.
+		 */
+		if (!rtc->no_irq)
+			disable_irq_nosync(rtc->irq_num);
+
+		/* Oscillator must be on and the countdown chain enabled. */
+		ctrla = rtc->read(rtc, RTC_CTRL_A);
+		ctrla |= RTC_CTRL_A_DV1;
+		ctrla &= ~(RTC_CTRL_A_DV2);
+		rtc->write(rtc, RTC_CTRL_A, ctrla);
+
+		/*
+		 * Read Control 4A and check the status of the auxillary
+		 * battery.  This must be present and working (VRT2 = 1)
+		 * for wakeup and kickstart functionality to be useful.
+		 */
+		ds1685_rtc_switch_to_bank1(rtc);
+		ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+		if (ctrl4a & RTC_CTRL_4A_VRT2) {
+			/* Clear all of the interrupt flags on Control 4A. */
+			ctrl4a &= ~(RTC_CTRL_4A_RWK_MASK);
+			rtc->write(rtc, RTC_EXT_CTRL_4A, ctrl4a);
+
+			/*
+			 * The auxillary battery is present and working.
+			 * Enable extended functions (ABE=1), enable
+			 * wake-up (WIE=1), and enable kickstart (KSE=1)
+			 * in Control 4B.
+			 */
+			ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B);
+			ctrl4b |= (RTC_CTRL_4B_ABE | RTC_CTRL_4B_WIE |
+				   RTC_CTRL_4B_KSE);
+			rtc->write(rtc, RTC_EXT_CTRL_4B, ctrl4b);
+		}
+
+		/* Set PAB to 1 in Control 4A to power the system down. */
+		dev_warn(&pdev->dev, "Powerdown.\n");
+		msleep(20);
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (ctrl4a | RTC_CTRL_4A_PAB));
+
+		/* Spin ... we do not switch back to bank0. */
+		unreachable();
+	}
+}
+EXPORT_SYMBOL(ds1685_rtc_poweroff);
+/* ----------------------------------------------------------------------- */
+
+
+MODULE_AUTHOR("Joshua Kinard <kumba@gentoo.org>");
+MODULE_AUTHOR("Matthias Fuchs <matthias.fuchs@esd-electronics.com>");
+MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
+MODULE_ALIAS("platform:rtc-ds1685");
diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h
new file mode 100644
index 000000000000..e6337a56d741
--- /dev/null
+++ b/include/linux/rtc/ds1685.h
@@ -0,0 +1,375 @@
+/*
+ * Definitions for the registers, addresses, and platform data of the
+ * DS1685/DS1687-series RTC chips.
+ *
+ * This Driver also works for the DS17X85/DS17X87 RTC chips.  Functionally
+ * similar to the DS1685/DS1687, they support a few extra features which
+ * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC
+ * write counter.
+ *
+ * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>.
+ * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>.
+ *
+ * References:
+ *    DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10.
+ *    DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10.
+ *    DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105.
+ *    Application Note 90, Using the Multiplex Bus RTC Extended Features.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_RTC_DS1685_H_
+#define _LINUX_RTC_DS1685_H_
+
+#include <linux/rtc.h>
+#include <linux/platform_device.h>
+#include <linux/workqueue.h>
+
+/**
+ * struct ds1685_priv - DS1685 private data structure.
+ * @dev: pointer to the rtc_device structure.
+ * @regs: iomapped base address pointer of the RTC registers.
+ * @regstep: padding/step size between registers (optional).
+ * @baseaddr: base address of the RTC device.
+ * @size: resource size.
+ * @lock: private lock variable for spin locking/unlocking.
+ * @work: private workqueue.
+ * @irq: IRQ number assigned to the RTC device.
+ * @prepare_poweroff: pointer to platform pre-poweroff function.
+ * @wake_alarm: pointer to platform wake alarm function.
+ * @post_ram_clear: pointer to platform post ram-clear function.
+ */
+struct ds1685_priv {
+	struct rtc_device *dev;
+	void __iomem *regs;
+	u32 regstep;
+	resource_size_t baseaddr;
+	size_t size;
+	spinlock_t lock;
+	struct work_struct work;
+	int irq_num;
+	bool bcd_mode;
+	bool no_irq;
+	bool uie_unsupported;
+	bool alloc_io_resources;
+	u8 (*read)(struct ds1685_priv *, int);
+	void (*write)(struct ds1685_priv *, int, u8);
+	void (*prepare_poweroff)(void);
+	void (*wake_alarm)(void);
+	void (*post_ram_clear)(void);
+};
+
+
+/**
+ * struct ds1685_rtc_platform_data - platform data structure.
+ * @plat_prepare_poweroff: platform-specific pre-poweroff function.
+ * @plat_wake_alarm: platform-specific wake alarm function.
+ * @plat_post_ram_clear: platform-specific post ram-clear function.
+ *
+ * If your platform needs to use a custom padding/step size between
+ * registers, or uses one or more of the extended interrupts and needs special
+ * handling, then include this header file in your platform definition and
+ * set regstep and the plat_* pointers as appropriate.
+ */
+struct ds1685_rtc_platform_data {
+	const u32 regstep;
+	const bool bcd_mode;
+	const bool no_irq;
+	const bool uie_unsupported;
+	const bool alloc_io_resources;
+	u8 (*plat_read)(struct ds1685_priv *, int);
+	void (*plat_write)(struct ds1685_priv *, int, u8);
+	void (*plat_prepare_poweroff)(void);
+	void (*plat_wake_alarm)(void);
+	void (*plat_post_ram_clear)(void);
+};
+
+
+/*
+ * Time Registers.
+ */
+#define RTC_SECS		0x00	/* Seconds 00-59 */
+#define RTC_SECS_ALARM		0x01	/* Alarm Seconds 00-59 */
+#define RTC_MINS		0x02	/* Minutes 00-59 */
+#define RTC_MINS_ALARM		0x03	/* Alarm Minutes 00-59 */
+#define RTC_HRS			0x04	/* Hours 01-12 AM/PM || 00-23 */
+#define RTC_HRS_ALARM		0x05	/* Alarm Hours 01-12 AM/PM || 00-23 */
+#define RTC_WDAY		0x06	/* Day of Week 01-07 */
+#define RTC_MDAY		0x07	/* Day of Month 01-31 */
+#define RTC_MONTH		0x08	/* Month 01-12 */
+#define RTC_YEAR		0x09	/* Year 00-99 */
+#define RTC_CENTURY		0x48	/* Century 00-99 */
+#define RTC_MDAY_ALARM		0x49	/* Alarm Day of Month 01-31 */
+
+
+/*
+ * Bit masks for the Time registers in BCD Mode (DM = 0).
+ */
+#define RTC_SECS_BCD_MASK	0x7f	/* - x x x x x x x */
+#define RTC_MINS_BCD_MASK	0x7f	/* - x x x x x x x */
+#define RTC_HRS_12_BCD_MASK	0x1f	/* - - - x x x x x */
+#define RTC_HRS_24_BCD_MASK	0x3f	/* - - x x x x x x */
+#define RTC_MDAY_BCD_MASK	0x3f	/* - - x x x x x x */
+#define RTC_MONTH_BCD_MASK	0x1f	/* - - - x x x x x */
+#define RTC_YEAR_BCD_MASK	0xff	/* x x x x x x x x */
+
+/*
+ * Bit masks for the Time registers in BIN Mode (DM = 1).
+ */
+#define RTC_SECS_BIN_MASK	0x3f	/* - - x x x x x x */
+#define RTC_MINS_BIN_MASK	0x3f	/* - - x x x x x x */
+#define RTC_HRS_12_BIN_MASK	0x0f	/* - - - - x x x x */
+#define RTC_HRS_24_BIN_MASK	0x1f	/* - - - x x x x x */
+#define RTC_MDAY_BIN_MASK	0x1f	/* - - - x x x x x */
+#define RTC_MONTH_BIN_MASK	0x0f	/* - - - - x x x x */
+#define RTC_YEAR_BIN_MASK	0x7f	/* - x x x x x x x */
+
+/*
+ * Bit masks common for the Time registers in BCD or BIN Mode.
+ */
+#define RTC_WDAY_MASK		0x07	/* - - - - - x x x */
+#define RTC_CENTURY_MASK	0xff	/* x x x x x x x x */
+#define RTC_MDAY_ALARM_MASK	0xff	/* x x x x x x x x */
+#define RTC_HRS_AMPM_MASK	BIT(7)	/* Mask for the AM/PM bit */
+
+
+
+/*
+ * Control Registers.
+ */
+#define RTC_CTRL_A		0x0a	/* Control Register A */
+#define RTC_CTRL_B		0x0b	/* Control Register B */
+#define RTC_CTRL_C		0x0c	/* Control Register C */
+#define RTC_CTRL_D		0x0d	/* Control Register D */
+#define RTC_EXT_CTRL_4A		0x4a	/* Extended Control Register 4A */
+#define RTC_EXT_CTRL_4B		0x4b	/* Extended Control Register 4B */
+
+
+/*
+ * Bit names in Control Register A.
+ */
+#define RTC_CTRL_A_UIP		BIT(7)	/* Update In Progress */
+#define RTC_CTRL_A_DV2		BIT(6)	/* Countdown Chain */
+#define RTC_CTRL_A_DV1		BIT(5)	/* Oscillator Enable */
+#define RTC_CTRL_A_DV0		BIT(4)	/* Bank Select */
+#define RTC_CTRL_A_RS2		BIT(2)	/* Rate-Selection Bit 2 */
+#define RTC_CTRL_A_RS3		BIT(3)	/* Rate-Selection Bit 3 */
+#define RTC_CTRL_A_RS1		BIT(1)	/* Rate-Selection Bit 1 */
+#define RTC_CTRL_A_RS0		BIT(0)	/* Rate-Selection Bit 0 */
+#define RTC_CTRL_A_RS_MASK	0x0f	/* RS3 + RS2 + RS1 + RS0 */
+
+/*
+ * Bit names in Control Register B.
+ */
+#define RTC_CTRL_B_SET		BIT(7)	/* SET Bit */
+#define RTC_CTRL_B_PIE		BIT(6)	/* Periodic-Interrupt Enable */
+#define RTC_CTRL_B_AIE		BIT(5)	/* Alarm-Interrupt Enable */
+#define RTC_CTRL_B_UIE		BIT(4)	/* Update-Ended Interrupt-Enable */
+#define RTC_CTRL_B_SQWE		BIT(3)	/* Square-Wave Enable */
+#define RTC_CTRL_B_DM		BIT(2)	/* Data Mode */
+#define RTC_CTRL_B_2412		BIT(1)	/* 12-Hr/24-Hr Mode */
+#define RTC_CTRL_B_DSE		BIT(0)	/* Daylight Savings Enable */
+#define RTC_CTRL_B_PAU_MASK	0x70	/* PIE + AIE + UIE */
+
+
+/*
+ * Bit names in Control Register C.
+ *
+ * BIT(0), BIT(1), BIT(2), & BIT(3) are unused, always return 0, and cannot
+ * be written to.
+ */
+#define RTC_CTRL_C_IRQF		BIT(7)	/* Interrupt-Request Flag */
+#define RTC_CTRL_C_PF		BIT(6)	/* Periodic-Interrupt Flag */
+#define RTC_CTRL_C_AF		BIT(5)	/* Alarm-Interrupt Flag */
+#define RTC_CTRL_C_UF		BIT(4)	/* Update-Ended Interrupt Flag */
+#define RTC_CTRL_C_PAU_MASK	0x70	/* PF + AF + UF */
+
+
+/*
+ * Bit names in Control Register D.
+ *
+ * BIT(0) through BIT(6) are unused, always return 0, and cannot
+ * be written to.
+ */
+#define RTC_CTRL_D_VRT		BIT(7)	/* Valid RAM and Time */
+
+
+/*
+ * Bit names in Extended Control Register 4A.
+ *
+ * On the DS1685/DS1687/DS1689/DS1693, BIT(4) and BIT(5) are reserved for
+ * future use.  They can be read from and written to, but have no effect
+ * on the RTC's operation.
+ *
+ * On the DS17x85/DS17x87, BIT(5) is Burst-Mode Enable (BME), and allows
+ * access to the extended NV-SRAM by automatically incrementing the address
+ * register when they are read from or written to.
+ */
+#define RTC_CTRL_4A_VRT2	BIT(7)	/* Auxillary Battery Status */
+#define RTC_CTRL_4A_INCR	BIT(6)	/* Increment-in-Progress Status */
+#define RTC_CTRL_4A_PAB		BIT(3)	/* Power-Active Bar Control */
+#define RTC_CTRL_4A_RF		BIT(2)	/* RAM-Clear Flag */
+#define RTC_CTRL_4A_WF		BIT(1)	/* Wake-Up Alarm Flag */
+#define RTC_CTRL_4A_KF		BIT(0)	/* Kickstart Flag */
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+#define RTC_CTRL_4A_BME		BIT(5)	/* Burst-Mode Enable */
+#endif
+#define RTC_CTRL_4A_RWK_MASK	0x07	/* RF + WF + KF */
+
+
+/*
+ * Bit names in Extended Control Register 4B.
+ */
+#define RTC_CTRL_4B_ABE		BIT(7)	/* Auxillary Battery Enable */
+#define RTC_CTRL_4B_E32K	BIT(6)	/* Enable 32.768Hz on SQW Pin */
+#define RTC_CTRL_4B_CS		BIT(5)	/* Crystal Select */
+#define RTC_CTRL_4B_RCE		BIT(4)	/* RAM Clear-Enable */
+#define RTC_CTRL_4B_PRS		BIT(3)	/* PAB Reset-Select */
+#define RTC_CTRL_4B_RIE		BIT(2)	/* RAM Clear-Interrupt Enable */
+#define RTC_CTRL_4B_WIE		BIT(1)	/* Wake-Up Alarm-Interrupt Enable */
+#define RTC_CTRL_4B_KSE		BIT(0)	/* Kickstart Interrupt-Enable */
+#define RTC_CTRL_4B_RWK_MASK	0x07	/* RIE + WIE + KSE */
+
+
+/*
+ * Misc register names in Bank 1.
+ *
+ * The DV0 bit in Control Register A must be set to 1 for these registers
+ * to become available, including Extended Control Registers 4A & 4B.
+ */
+#define RTC_BANK1_SSN_MODEL	0x40	/* Model Number */
+#define RTC_BANK1_SSN_BYTE_1	0x41	/* 1st Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_2	0x42	/* 2nd Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_3	0x43	/* 3rd Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_4	0x44	/* 4th Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_5	0x45	/* 5th Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_6	0x46	/* 6th Byte of Serial Number */
+#define RTC_BANK1_SSN_CRC	0x47	/* Serial CRC Byte */
+#define RTC_BANK1_RAM_DATA_PORT	0x53	/* Extended RAM Data Port */
+
+
+/*
+ * Model-specific registers in Bank 1.
+ *
+ * The addresses below differ depending on the model of the RTC chip
+ * selected in the kernel configuration.  Not all of these features are
+ * supported in the main driver at present.
+ *
+ * DS1685/DS1687   - Extended NV-SRAM address (LSB only).
+ * DS1689/DS1693   - Vcc, Vbat, Pwr Cycle Counters & Customer-specific S/N.
+ * DS17x85/DS17x87 - Extended NV-SRAM addresses (MSB & LSB) & Write counter.
+ */
+#if defined(CONFIG_RTC_DRV_DS1685)
+#define RTC_BANK1_RAM_ADDR	0x50	/* NV-SRAM Addr */
+#elif defined(CONFIG_RTC_DRV_DS1689)
+#define RTC_BANK1_VCC_CTR_LSB	0x54	/* Vcc Counter Addr (LSB) */
+#define RTC_BANK1_VCC_CTR_MSB	0x57	/* Vcc Counter Addr (MSB) */
+#define RTC_BANK1_VBAT_CTR_LSB	0x58	/* Vbat Counter Addr (LSB) */
+#define RTC_BANK1_VBAT_CTR_MSB	0x5b	/* Vbat Counter Addr (MSB) */
+#define RTC_BANK1_PWR_CTR_LSB	0x5c	/* Pwr Cycle Counter Addr (LSB) */
+#define RTC_BANK1_PWR_CTR_MSB	0x5d	/* Pwr Cycle Counter Addr (MSB) */
+#define RTC_BANK1_UNIQ_SN	0x60	/* Customer-specific S/N */
+#else /* DS17x85/DS17x87 */
+#define RTC_BANK1_RAM_ADDR_LSB	0x50	/* NV-SRAM Addr (LSB) */
+#define RTC_BANK1_RAM_ADDR_MSB	0x51	/* NV-SRAM Addr (MSB) */
+#define RTC_BANK1_WRITE_CTR	0x5e	/* RTC Write Counter */
+#endif
+
+
+/*
+ * Model numbers.
+ *
+ * The DS1688/DS1691 and DS1689/DS1693 chips share the same model number
+ * and the manual doesn't indicate any major differences.  As such, they
+ * are regarded as the same chip in this driver.
+ */
+#define RTC_MODEL_DS1685	0x71	/* DS1685/DS1687 */
+#define RTC_MODEL_DS17285	0x72	/* DS17285/DS17287 */
+#define RTC_MODEL_DS1689	0x73	/* DS1688/DS1691/DS1689/DS1693 */
+#define RTC_MODEL_DS17485	0x74	/* DS17485/DS17487 */
+#define RTC_MODEL_DS17885	0x78	/* DS17885/DS17887 */
+
+
+/*
+ * Periodic Interrupt Rates / Square-Wave Output Frequency
+ *
+ * Periodic rates are selected by setting the RS3-RS0 bits in Control
+ * Register A and enabled via either the E32K bit in Extended Control
+ * Register 4B or the SQWE bit in Control Register B.
+ *
+ * E32K overrides the settings of RS3-RS0 and outputs a frequency of 32768Hz
+ * on the SQW pin of the RTC chip.  While there are 16 possible selections,
+ * the 1-of-16 decoder is only able to divide the base 32768Hz signal into 13
+ * smaller frequencies.  The values 0x01 and 0x02 are not used and are
+ * synonymous with 0x08 and 0x09, respectively.
+ *
+ * When E32K is set to a logic 1, periodic interrupts are disabled and reading
+ * /dev/rtc will return -EINVAL.  This also applies if the periodic interrupt
+ * frequency is set to 0Hz.
+ *
+ * Not currently used by the rtc-ds1685 driver because the RTC core removed
+ * support for hardware-generated periodic-interrupts in favour of
+ * hrtimer-generated interrupts.  But these defines are kept around for use
+ * in userland, as documentation to the hardware, and possible future use if
+ * hardware-generated periodic interrupts are ever added back.
+ */
+					/* E32K RS3 RS2 RS1 RS0 */
+#define RTC_SQW_8192HZ		0x03	/*  0    0   0   1   1  */
+#define RTC_SQW_4096HZ		0x04	/*  0    0   1   0   0  */
+#define RTC_SQW_2048HZ		0x05	/*  0    0   1   0   1  */
+#define RTC_SQW_1024HZ		0x06	/*  0    0   1   1   0  */
+#define RTC_SQW_512HZ		0x07	/*  0    0   1   1   1  */
+#define RTC_SQW_256HZ		0x08	/*  0    1   0   0   0  */
+#define RTC_SQW_128HZ		0x09	/*  0    1   0   0   1  */
+#define RTC_SQW_64HZ		0x0a	/*  0    1   0   1   0  */
+#define RTC_SQW_32HZ		0x0b	/*  0    1   0   1   1  */
+#define RTC_SQW_16HZ		0x0c	/*  0    1   1   0   0  */
+#define RTC_SQW_8HZ		0x0d	/*  0    1   1   0   1  */
+#define RTC_SQW_4HZ		0x0e	/*  0    1   1   1   0  */
+#define RTC_SQW_2HZ		0x0f	/*  0    1   1   1   1  */
+#define RTC_SQW_0HZ		0x00	/*  0    0   0   0   0  */
+#define RTC_SQW_32768HZ		32768	/*  1    -   -   -   -  */
+#define RTC_MAX_USER_FREQ	8192
+
+
+/*
+ * NVRAM data & addresses:
+ *   - 50 bytes of NVRAM are available just past the clock registers.
+ *   - 64 additional bytes are available in Bank0.
+ *
+ * Extended, battery-backed NV-SRAM:
+ *   - DS1685/DS1687    - 128 bytes.
+ *   - DS1689/DS1693    - 0 bytes.
+ *   - DS17285/DS17287  - 2048 bytes.
+ *   - DS17485/DS17487  - 4096 bytes.
+ *   - DS17885/DS17887  - 8192 bytes.
+ */
+#define NVRAM_TIME_BASE		0x0e	/* NVRAM Addr in Time regs */
+#define NVRAM_BANK0_BASE	0x40	/* NVRAM Addr in Bank0 regs */
+#define NVRAM_SZ_TIME		50
+#define NVRAM_SZ_BANK0		64
+#if defined(CONFIG_RTC_DRV_DS1685)
+#  define NVRAM_SZ_EXTND	128
+#elif defined(CONFIG_RTC_DRV_DS1689)
+#  define NVRAM_SZ_EXTND	0
+#elif defined(CONFIG_RTC_DRV_DS17285)
+#  define NVRAM_SZ_EXTND	2048
+#elif defined(CONFIG_RTC_DRV_DS17485)
+#  define NVRAM_SZ_EXTND	4096
+#elif defined(CONFIG_RTC_DRV_DS17885)
+#  define NVRAM_SZ_EXTND	8192
+#endif
+#define NVRAM_TOTAL_SZ_BANK0	(NVRAM_SZ_TIME + NVRAM_SZ_BANK0)
+#define NVRAM_TOTAL_SZ		(NVRAM_TOTAL_SZ_BANK0 + NVRAM_SZ_EXTND)
+
+
+/*
+ * Function Prototypes.
+ */
+extern void __noreturn
+ds1685_rtc_poweroff(struct platform_device *pdev);
+
+#endif /* _LINUX_RTC_DS1685_H_ */
-- 
cgit v1.2.3


From e59b4e9187bd5175b9845dc10fedb0879b7efbfd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 21 Jan 2015 20:03:40 +0000
Subject: debugfs: Provide a file creation function that also takes an initial
 size

Provide a file creation function that also takes an initial size so that the
caller doesn't have to set i_size, thus meaning that we don't have to call
deal with ->d_inode in the callers.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/infiniband/hw/cxgb4/device.c               | 35 ++++++-------------
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c |  9 ++---
 drivers/scsi/csiostor/csio_init.c                  |  9 ++---
 drivers/usb/gadget/udc/atmel_usba_udc.c            | 15 ++++----
 fs/debugfs/inode.c                                 | 40 ++++++++++++++++++++++
 include/linux/debugfs.h                            | 13 +++++++
 6 files changed, 79 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index eb5df4e62703..391309a55360 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -700,37 +700,24 @@ static const struct file_operations ep_debugfs_fops = {
 
 static int setup_debugfs(struct c4iw_dev *devp)
 {
-	struct dentry *de;
-
 	if (!devp->debugfs_root)
 		return -1;
 
-	de = debugfs_create_file("qps", S_IWUSR, devp->debugfs_root,
-				 (void *)devp, &qp_debugfs_fops);
-	if (de && de->d_inode)
-		de->d_inode->i_size = 4096;
+	debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &qp_debugfs_fops, 4096);
 
-	de = debugfs_create_file("stags", S_IWUSR, devp->debugfs_root,
-				 (void *)devp, &stag_debugfs_fops);
-	if (de && de->d_inode)
-		de->d_inode->i_size = 4096;
+	debugfs_create_file_size("stags", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &stag_debugfs_fops, 4096);
 
-	de = debugfs_create_file("stats", S_IWUSR, devp->debugfs_root,
-			(void *)devp, &stats_debugfs_fops);
-	if (de && de->d_inode)
-		de->d_inode->i_size = 4096;
+	debugfs_create_file_size("stats", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &stats_debugfs_fops, 4096);
 
-	de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root,
-			(void *)devp, &ep_debugfs_fops);
-	if (de && de->d_inode)
-		de->d_inode->i_size = 4096;
+	debugfs_create_file_size("eps", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &ep_debugfs_fops, 4096);
 
-	if (c4iw_wr_log) {
-		de = debugfs_create_file("wr_log", S_IWUSR, devp->debugfs_root,
-					 (void *)devp, &wr_log_debugfs_fops);
-		if (de && de->d_inode)
-			de->d_inode->i_size = 4096;
-	}
+	if (c4iw_wr_log)
+		debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root,
+					 (void *)devp, &wr_log_debugfs_fops, 4096);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
index c98a350d857e..4c7fe447e407 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
@@ -91,12 +91,9 @@ static const struct file_operations mem_debugfs_fops = {
 static void add_debugfs_mem(struct adapter *adap, const char *name,
 			    unsigned int idx, unsigned int size_mb)
 {
-	struct dentry *de;
-
-	de = debugfs_create_file(name, S_IRUSR, adap->debugfs_root,
-				 (void *)adap + idx, &mem_debugfs_fops);
-	if (de && de->d_inode)
-		de->d_inode->i_size = size_mb << 20;
+	debugfs_create_file_size(name, S_IRUSR, adap->debugfs_root,
+				 (void *)adap + idx, &mem_debugfs_fops,
+				 size_mb << 20);
 }
 
 /* Add an array of Debug FS files.
diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c
index 34d20cc3e110..2617f15eaeeb 100644
--- a/drivers/scsi/csiostor/csio_init.c
+++ b/drivers/scsi/csiostor/csio_init.c
@@ -113,12 +113,9 @@ static const struct file_operations csio_mem_debugfs_fops = {
 void csio_add_debugfs_mem(struct csio_hw *hw, const char *name,
 				 unsigned int idx, unsigned int size_mb)
 {
-	struct dentry *de;
-
-	de = debugfs_create_file(name, S_IRUSR, hw->debugfs_root,
-				 (void *)hw + idx, &csio_mem_debugfs_fops);
-	if (de && de->d_inode)
-		de->d_inode->i_size = size_mb << 20;
+	debugfs_create_file_size(name, S_IRUSR, hw->debugfs_root,
+				 (void *)hw + idx, &csio_mem_debugfs_fops,
+				 size_mb << 20);
 }
 
 static int csio_setup_debugfs(struct csio_hw *hw)
diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
index 9f93bed42052..b2239a2d4941 100644
--- a/drivers/usb/gadget/udc/atmel_usba_udc.c
+++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
@@ -264,14 +264,17 @@ static void usba_init_debugfs(struct usba_udc *udc)
 		goto err_root;
 	udc->debugfs_root = root;
 
-	regs = debugfs_create_file("regs", 0400, root, udc, &regs_dbg_fops);
-	if (!regs)
-		goto err_regs;
-
 	regs_resource = platform_get_resource(udc->pdev, IORESOURCE_MEM,
 				CTRL_IOMEM_ID);
-	regs->d_inode->i_size = resource_size(regs_resource);
-	udc->debugfs_regs = regs;
+
+	if (regs_resource) {
+		regs = debugfs_create_file_size("regs", 0400, root, udc,
+						&regs_dbg_fops,
+						resource_size(regs_resource));
+		if (!regs)
+			goto err_regs;
+		udc->debugfs_regs = regs;
+	}
 
 	usba_ep_init_debugfs(udc, to_usba_ep(udc->gadget.ep0));
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 957c40ce09f6..45b18a5e225c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -337,6 +337,46 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_file);
 
+/**
+ * debugfs_create_file_size - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *        this file.
+ * @file_size: initial file size
+ *
+ * This is the basic "create a file" function for debugfs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops,
+					loff_t file_size)
+{
+	struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
+
+	if (de)
+		de->d_inode->i_size = file_size;
+	return de;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_size);
+
 /**
  * debugfs_create_dir - create a directory in the debugfs filesystem
  * @name: a pointer to a string containing the name of the directory to
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index ea149a24a1f2..cb25af461054 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -51,6 +51,11 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
 
+struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops,
+					loff_t file_size);
+
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
 
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
@@ -129,6 +134,14 @@ static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops,
+					loff_t file_size)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline struct dentry *debugfs_create_dir(const char *name,
 						struct dentry *parent)
 {
-- 
cgit v1.2.3


From 73d7e3eac01da3cef32ab25cbc6a36a6202c4ea6 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 17 Feb 2015 13:45:44 -0800
Subject: kexec: remove never used member destination in kimage

struct kimage has a member destination which is used to store the real
destination address of each page when load segment from user space buffer
to kernel.  But we never retrieve the value stored in kimage->destination,
so this member variable in kimage and its assignment operation are
redundent code.

I guess for_each_kimage_entry just does the work that kimage->destination
is expected to do.

So in this patch just make a cleanup to remove it.

Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h | 2 --
 kernel/kexec.c        | 4 ----
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9d957b7ae095..10da8e246317 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -122,8 +122,6 @@ struct kimage {
 	kimage_entry_t *entry;
 	kimage_entry_t *last_entry;
 
-	unsigned long destination;
-
 	unsigned long start;
 	struct page *control_code_page;
 	struct page *swap_page;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c85277639b34..35dcac4b5c1c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
 
 	destination &= PAGE_MASK;
 	result = kimage_add_entry(image, destination | IND_DESTINATION);
-	if (result == 0)
-		image->destination = destination;
 
 	return result;
 }
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
 
 	page &= PAGE_MASK;
 	result = kimage_add_entry(image, page | IND_SOURCE);
-	if (result == 0)
-		image->destination += PAGE_SIZE;
 
 	return result;
 }
-- 
cgit v1.2.3


From cf2df6396ba78014289f322839a5cc785f09e1fd Mon Sep 17 00:00:00 2001
From: Geoff Levand <geoff@infradead.org>
Date: Tue, 17 Feb 2015 13:45:56 -0800
Subject: kexec: add bit definitions for kimage entry flags

Define new kexec preprocessor macros IND_*_BIT that define the bit
position of the kimage entry flags.  Change the existing IND_* flag macros
to be defined as bit shifts of the corresponding IND_*_BIT macros.  Also
wrap all C language code in kexec.h with #if !defined(__ASSEMBLY__) so
assembly files can include kexec.h to get the IND_* and IND_*_BIT macros.

Some CPU instruction sets have tests for bit position which are convenient
in implementing routines that operate on the kimage entry list.  The
addition of these bit position macros in a common location will avoid
duplicate definitions and the chance that changes to the IND_* flags will
not be propagated to assembly files.

Signed-off-by: Geoff Levand <geoff@infradead.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Maximilian Attems <max@stro.at>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 10da8e246317..1fd980cc481b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -1,6 +1,18 @@
 #ifndef LINUX_KEXEC_H
 #define LINUX_KEXEC_H
 
+#define IND_DESTINATION_BIT 0
+#define IND_INDIRECTION_BIT 1
+#define IND_DONE_BIT        2
+#define IND_SOURCE_BIT      3
+
+#define IND_DESTINATION  (1 << IND_DESTINATION_BIT)
+#define IND_INDIRECTION  (1 << IND_INDIRECTION_BIT)
+#define IND_DONE         (1 << IND_DONE_BIT)
+#define IND_SOURCE       (1 << IND_SOURCE_BIT)
+
+#if !defined(__ASSEMBLY__)
+
 #include <uapi/linux/kexec.h>
 
 #ifdef CONFIG_KEXEC
@@ -64,10 +76,6 @@
  */
 
 typedef unsigned long kimage_entry_t;
-#define IND_DESTINATION  0x1
-#define IND_INDIRECTION  0x2
-#define IND_DONE         0x4
-#define IND_SOURCE       0x8
 
 struct kexec_segment {
 	/*
@@ -311,4 +319,7 @@ struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #endif /* CONFIG_KEXEC */
+
+#endif /* !defined(__ASSEBMLY__) */
+
 #endif /* LINUX_KEXEC_H */
-- 
cgit v1.2.3


From b28c2ee868dbdc0baa89c60fb520be85d5e90a72 Mon Sep 17 00:00:00 2001
From: Geoff Levand <geoff@infradead.org>
Date: Tue, 17 Feb 2015 13:45:58 -0800
Subject: kexec: add IND_FLAGS macro

Add a new kexec preprocessor macro IND_FLAGS, which is the bitwise OR of
all the possible kexec IND_ kimage_entry indirection flags.  Having this
macro allows for simplified code in the prosessing of the kexec
kimage_entry items.  Also, remove the local powerpc definition and use the
generic one.

Signed-off-by: Geoff Levand <geoff@infradead.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Maximilian Attems <max@stro.at>
Cc: Michal Marek <mmarek@suse.cz>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/machine_kexec_64.c | 2 --
 include/linux/kexec.h                  | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index f96d1ec24189..1a74446fd9e5 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -96,8 +96,6 @@ int default_machine_kexec_prepare(struct kimage *image)
 	return 0;
 }
 
-#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE)
-
 static void copy_segments(unsigned long ind)
 {
 	unsigned long entry;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 1fd980cc481b..e60a745ac198 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -10,6 +10,7 @@
 #define IND_INDIRECTION  (1 << IND_INDIRECTION_BIT)
 #define IND_DONE         (1 << IND_DONE_BIT)
 #define IND_SOURCE       (1 << IND_SOURCE_BIT)
+#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE)
 
 #if !defined(__ASSEMBLY__)
 
-- 
cgit v1.2.3


From 7647f14fe4cd98151f8e90656c01fe61044de714 Mon Sep 17 00:00:00 2001
From: John de la Garza <john@jjdev.com>
Date: Tue, 17 Feb 2015 13:46:04 -0800
Subject: lib/rbtree.c: fix typo in comment

Signed-off-by: John de la Garza <john@jjdev.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rbtree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 57e75ae9910f..fb31765e935a 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -51,7 +51,7 @@ struct rb_root {
 
 #define RB_EMPTY_ROOT(root)  ((root)->rb_node == NULL)
 
-/* 'empty' nodes are nodes that are known not to be inserted in an rbree */
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
 #define RB_EMPTY_NODE(node)  \
 	((node)->__rb_parent_color == (unsigned long)(node))
 #define RB_CLEAR_NODE(node)  \
-- 
cgit v1.2.3